diff --git "a/app/src/content/article.mdx" "b/app/src/content/article.mdx"
--- "a/app/src/content/article.mdx"
+++ "b/app/src/content/article.mdx"
@@ -71,20 +71,13 @@ affiliations:
url: 'https://huggingface.co'
published: 'Oct. 30, 2025'
-doi: 10.1234/abcd.efgh
-licence: >
- Diagrams and text are licensed under CC‑BY 4.0 with the source available on Hugging Face, unless noted
- otherwise. Figures reused from other sources are excluded and marked in their
- captions (“Figure from …”).
tags:
- - research
- - template
+ - research-article-template
+ - scientific paper
+ - data visualization
tableOfContentsAutoCollapse: true
+seoThumbImage: /thumb.png
pdfProOnly: true
---
@@ -110,11 +103,12 @@ import Screenshot_2025_10_24_at_09_37_24_2961384e_bcac_8055_9e8e_ffbd3a1aa368 fr
import Screenshot_2025_09_26_at_22_36_40_27a1384e_bcac_8063_94e0_f1c689e7d9b9 from './assets/image/Screenshot_2025-09-26_at_22_36_40_27a1384e-bcac-8063-94e0-f1c689e7d9b9.png';
import GtU8DnoWsAAruEG_28e1384e_bcac_8051_8122_ed6cacf8f632 from './assets/image/GtU8DnoWsAAruEG_28e1384e-bcac-8051-8122-ed6cacf8f632.png';
import Screenshot_2025_10_01_at_11_31_19_28e1384e_bcac_8005_8c5e_f0af3bf70372 from './assets/image/Screenshot_2025-10-01_at_11_31_19_28e1384e-bcac-8005-8c5e-f0af3bf70372.png';
+import Screenshot_2025_10_30_at_13_02_36_29c1384e_bcac_80d6_a72d_ff34bc221b60 from './assets/image/Screenshot_2025-10-30_at_13_02_36_29c1384e-bcac-80d6-a72d-ff34bc221b60.png';
import image_2881384e_bcac_80d6_84fe_d705cb1eae0a from './assets/image/image_2881384e-bcac-80d6-84fe-d705cb1eae0a.png';
import image_2881384e_bcac_801d_9f3d_c875181b9dd1 from './assets/image/image_2881384e-bcac-801d-9f3d-c875181b9dd1.png';
import image_2881384e_bcac_80ed_9bdf_c077977d77b8 from './assets/image/image_2881384e-bcac-80ed-9bdf-c077977d77b8.png';
import h100_dgx_2891384e_bcac_80cf_9f86_ccf0653a79e5 from './assets/image/h100_dgx_2891384e-bcac-80cf-9f86-ccf0653a79e5.gif';
-import lstopo_2951384e_bcac_808f_a7c5_c244e7ac69db from './assets/image/lstopo_2951384e-bcac-808f-a7c5-c244e7ac69db.jpg';
+import lstopo_29c1384e_bcac_80c9_9715_cbfe9e73d86b from './assets/image/lstopo_29c1384e-bcac-80c9-9715-cbfe9e73d86b.jpg';
import image_2891384e_bcac_80e2_9cc5_c2c46c7ab39b from './assets/image/image_2891384e-bcac-80e2-9cc5-c2c46c7ab39b.png';
import image_27d1384e_bcac_80b1_9ffb_ec29d0021ccc from './assets/image/image_27d1384e-bcac-80b1-9ffb-ec29d0021ccc.png';
@@ -127,15 +121,15 @@ What does it actually take to train a high-performance LLM today?
Reading time: 2-4 days.
-Published research makes it look straightforward: strategic architecture choices, carefully curated datasets, and sufficient compute. The results are polished, the ablations are structured and clean. Every decision seems obvious in hindsight. But these technical reports only show what worked and apply a bit of rosy retrospection – they don't capture the 2am dataloader debugging sessions, the loss spikes, or the subtle tensor parallelism bug (see later!) that quietly sabotages your training. The reality is messier, more iterative, and full of decisions that don't make it into the final technical report.
+Published research makes it look straightforward: strategic architecture choices, carefully curated datasets, and sufficient compute. The results are polished, the ablations are structured and clean. Every decision seems obvious in hindsight. But those reports only show what worked and apply a bit of rosy retrospection – they don't capture the 2am dataloader debugging sessions, the loss spikes, or the subtle tensor parallelism bug (see later!) that quietly sabotages your training. The reality is messier, more iterative, and full of decisions that don't make it into the final paper.
Join us as we look behind the scenes of training [SmolLM3](https://huggingface.co/HuggingFaceTB/SmolLM3-3B), a 3B multilingual reasoning model trained on 11T tokens. This is not an ordinary blog post, but rather the untangling of a spiderweb of decisions, discoveries, and dead ends that led to deep insights into what it takes to build world-class language models.
-It is also the final opus in our model-training long-form series: we've worked through building datasets at scale ([FineWeb](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1)), orchestrating thousands of GPUs to sing in unison ([Ultra Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook)), and selecting the best evaluations at each step of the process ([Evaluation Guidebook](https://github.com/huggingface/evaluation-guidebook)). Now we shape it all together to build a strong AI model. We'll walk you through the complete journey – not just the final recipe that worked, but showcase how experiments fail, infrastructure breaks, and how debugging processes shaped our decisions.
+It is also the final opus in our model-training long-form series: we've worked through building datasets at scale ([FineWeb](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1)), orchestrating thousands of GPUs to sing in unison ([Ultra Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook)), and selecting the best evaluations at each step of the process ([Evaluation Guidebook](https://github.com/huggingface/evaluation-guidebook)). Now we shape it all together to build a strong AI model. We'll walk you through the complete journey – not just the final recipe that worked, but the failures, infrastructure breakdowns, and debugging processes that shaped every decision.
The story reads like a drama: you'll see how promising small-scale ablations sometimes don't translate at scale, why we restarted a training after 1T tokens, how we balanced the competing objectives of multilinguality, math, and code while maintaining strong English performance, and finally how we post-trained a hybrid reasoning model.
-We also tried to avoid a cold list of all we did in favour of an organized story through our adventure. Think of this as a guide for anyone trying to go from "we have a great dataset and GPUs" to "we built a really strong model". We hope being this open will helps close the gap between research and production, and make your next training run a little less chaotic.
+We also tried to avoid a cold list of all we did in favour of an organized story through our adventure. Think of this as a guide for anyone trying to go from "we have a great dataset and GPUs" to "we built a really strong model". We hope being this open will help close the gap between research and production, and make your next training run a little less chaotic.
### How to read this blog post
@@ -161,7 +155,7 @@ If you have questions or remarks, open a discussion on the
@@ -170,7 +164,7 @@ Which raises an uncomfortable truth: maybe you *don't need to train your own mod
This might seem like an odd way to start an "LLM training guide". But many failed training projects didn't fail because of bad hyperparameters or buggy code; they failed because someone decided to train a model they didn't need. So before you commit to training, and dive into *how* to execute it, you need to answer two questions: *why* are you training this model? And *what* model should you train? Without clear answers, you'll waste months of compute and engineering time building something the world already has, or worse, something nobody needs.
-Let's start with the why, because without understanding your purpose, you can't make coherent decisions about anything that follows.
+Let's start with the *why*, because without understanding your purpose, you can't make coherent decisions about anything that follows.
@@ -195,48 +189,15 @@ The allure of "we trained our own model" is powerful, but before investing a lot
The flowchart below guides the thought process one should go through before starting a big pretraining project. From a technical perspective, you should essentially first find out whether there is an existing model that you can either prompt or fine-tune to do the job.
-```mermaid
-flowchart TD
- A["Should you train your own model?"] --> B["Can existing models handle your use case?"]
-
- B --> C["YES
- Existing models work well just with prompting"]
- B --> D["NO
- Prompting isn't enough"]
-
- C --> E["❌
- Don't train. Use existing models"]
-
- D --> F["Can finetuning solve your problem?"]
-
- F --> G["YES
- Finetuning works (post-training/continual pretraining)"]
- F --> H["NO
- Finetuning cannot solve your problem"]
-
- G --> I["❌
- Don't train from scratch"]
-
- H --> J["Train a model under one of these categories"]
-
- J --> K["🔬
- Research"]
- J --> L["🏭
- Production"]
- J --> N["🌐
- Strategic Open-Source"]
-
- classDef question fill:#ffd0c5
- classDef decision fill:#f9f9f9
- classDef success fill:#d1f2eb
- classDef danger fill:#f8d7da
- classDef category fill:#ffb9b7
-
- class A,B,F question
- class C,D,G,H decision
- class E,I success
- class J,K,L,N category
-```
+
+
+
+
+
+
+The "why" we discuss is about training from scratch. We don't cover distillation or pruning in this blog. These are valid paths to efficient models but represent different workflows than training from scratch. We recommend NVIDIA's Minitron paper for an overview of these topics.
+
+
There are essentially three common areas where custom pretraining can make sense: you want to do novel research, you have very specific needs for a production use-case, or you want to fill a gap in the open model ecosystem. Let's have a quick look at each:
#### **Research: what do you want to understand?**
@@ -245,7 +206,7 @@ There is plenty of research one can do in the LLM space. What LLM research proje
- Can we scale training on this new optimiser to a 10B+ model? From [Muon is Scalable for LLM Training ](https://huggingface.co/papers/2502.16982)
- Can reinforcement learning alone, without SFT, produce reasoning capabilities? From [DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning](https://huggingface.co/papers/2501.12948)
-- Can we train good small models on purely synthetic textbooks data? From [Textbooks Are All You Need ](/b349da53329a408f83c48c1f7ab8d6f0)
+- Can we train good small models on purely synthetic textbooks data? From [Textbooks Are All You Need ](https://huggingface.co/papers/2306.11644)
- Can we achieve competitive performance by training on only openly licensed data? From [The Common Pile v0.1: An 8TB Dataset of Public Domain and Openly Licensed Text](https://huggingface.co/papers/2506.05209)
Making the hypothesis as concrete as possible and thinking about the necessary experiment scale increases the chance of success.
@@ -264,7 +225,7 @@ A second, related reason is deployment constraints: when you need a model tailor
Here's a simple test: spend a few days building on top of Qwen3, Gemma3, or another current SOTA model. Can you reach your performance goals through prompting, tool-use, or post-training? If not, it's probably time to train your own.
-Even if the post-training budget needed to meet your requirements is immense, it might still be cheaper than starting from scratch. Post-training your model for 1T tokens is still more economic than starting from scratch to train for 10T+ tokens.
+Even if the post-training budget needed to meet your requirements is immense, it might still be cheaper than starting from scratch. Fine-tuning your model for 1T tokens is still more economical than starting from scratch to train for 10T+ tokens.
@@ -286,11 +247,6 @@ You have reasons to believe you can do better; perhaps you've curated better tra
This is a real goal and success creates value: developers adopt your model, it becomes infrastructure for others, or it establishes technical credibility. But success requires experience. You need to know what's actually feasible and how to execute reliably in a competitive space. To make this concrete, let's look at how we think about this question at Hugging Face.
-
-
-We also don't cover distillation or pruning in this blog. These are valid paths to efficient models but represent different workflows than training from scratch. We recommend NVIDIA's Minitron paper for an overview of these topics.
-
-
#### **Hugging Face's journey**
So why does Hugging Face train open models? The answer is simple: we build things that are useful to the open-source ecosystem and fill gaps that few others are filling.
@@ -318,7 +274,7 @@ you can find an overview here
+
-A change is de-risked when testing shows it either improves performance
+A change is derisked when testing shows it either improves performance
on your target capabilities, or provides a meaningful benefit
(e.g. faster inference, lower memory, better stability) without hurting
performance beyond your acceptable tradeoffs.
@@ -428,11 +384,11 @@ performance beyond your acceptable tradeoffs.
The tricky part is that your baseline and training setup have many components you could modify: attention mechanisms, positional encodings, activation functions, optimisers, training hyperparameters, normalisation schemes, model layout, and more. Each represents a potential experiment, and these components often interact in non-linear ways. You have neither the time nor compute to test everything or explore every interaction.
-Start by testing the promising changes individually to understand their isolated impact, then combine the ones that work and run a leave-one-out analysis if your compute budget allows for it.
+Start by testing promising changes against your current baseline. When something works, integrate it to create a new baseline, then test the next change against that. If your compute budget allows it, you can also test changes individually and run a leave-one-out analysis.
-Check out the ScaleRL paper [@scalerl] for an excellent example of this methodology in practice.
+Check out the ScaleRL paper [@scalerl] for an example of this methodology in practice.
Don't fall into the trap of exhaustive grid searches over every hyperparameter or testing every architectural variant that comes out.
@@ -447,13 +403,13 @@ Ask yourself two questions before testing any modification:
If a modification doesn't clearly address either question, skip it.
-Now that you know how to identify what's promising through strategic planning, it's time to move to the **empirical validation** . In the next sections, we'll show you *how* to actually test these changes in practice. We'll cover how to set up reliable experiments, interpret results, and avoid common pitfalls. Then in the following chapters, we'll walk through concrete examples of testing popular architectural, data, infra and training decisions.
+Now that you know how to identify what's promising through strategic planning, it's time to move to the **empirical validation.** In the next sections, we'll show you *how* to actually test these changes in practice. We'll cover how to set up reliable experiments, interpret results, and avoid common pitfalls. Then in the following chapters, we'll walk through concrete examples of testing popular architectural, data, infra and training decisions.
So let's build a simple ablation setup we can use for our experiments. First, we need to decide which training framework to pick.
### Picking a training framework
-The first decision we need to make is which framework to use for training our model, and by extension, for running all our ablations. This choice involves balancing three key considerations that, frustratingly, will work against each other:
+The first decision we need to make is which framework to use for training our model, and by extension, for running all our ablations. This choice involves balancing three key considerations:
@@ -476,11 +432,11 @@ In practice, these requirements might pull against each other, creating trade-of
| **Nanotron** | 🎯 Minimal, tailored for HF pretraining | ✅ Yes (StarCoder, SmolLM) | ✅ Optimised (UltraScale Playbook) | 15k / 66k | ⚡ Moderate: requires parallelism know-how |
-The table above summarises the key trade-offs between popular frameworks. Lines of code for the first three frameworks are from the TorchTitan technical report [@torchtitan] Let's discuss each in more detail:
+The table above summarises the key trade-offs between popular frameworks. Lines of code for the first three frameworks are from the TorchTitan technical report [@torchtitan]. Let's discuss each in more detail:
-[Megatron-LM](https://github.com/NVIDIA/Megatron-LM) from Nvidia has been around for years and is battle-tested. It's what powers models like Kimi's K2 [@kimik2], it delivers solid throughput and has most of the production features we'd want. But that maturity comes with complexity: the codebase can be hard to navigate and modify when we need to implement something new.
+[Megatron-LM](https://github.com/NVIDIA/Megatron-LM) from Nvidia has been around for years and is battle-tested. It's what powers models like Kimi's K2 [@kimik2], it delivers solid throughput and has most of the production features we'd want. But that maturity comes with complexity: the codebase can be hard to navigate and modify when you're new to it.
-[DeepSpeed](https://github.com/deepspeedai/DeepSpeed) falls into a similar category. It's the pioneer of ZeRO optimisation and powered models like BLOOM and GLM. Like Megatron-LM, it's extensively battle-tested and optimised, but shares the same complexity challenges. The large codebase (194k total lines) can be intimidating when you need to implement custom features or debug unexpected behavior.
+[DeepSpeed](https://github.com/deepspeedai/DeepSpeed) falls into a similar category. It's the pioneer of ZeRO optimisation and powered models like BLOOM and GLM. Like Megatron-LM, it's extensively battle-tested and optimised, but shares the same complexity challenges. The large codebase (194k total lines) can be intimidating when you're getting started, particularly for implementing custom features or debugging unexpected behavior.
On the other side, PyTorch's recent [TorchTitan](https://github.com/pytorch/torchtitan) library is much lighter and simpler to navigate, thanks to its compact and modular codebase. It has the core features needed for pretraining and is great for rapid experimentation. However, being newer, it isn't as battle-tested and can still be a bit unstable as it's actively developed.
@@ -494,15 +450,18 @@ If multiple frameworks support your needs, compare their throughput on your spec
### Ablation setup
+With our framework chosen, we now need to design our ablation setup. We need experiments that are fast enough to iterate on quickly, but large enough that the results give us signal and transfer to the final model. Let's see how to set this up.
+
#### Setting up our ablation framework
-Now that we've chosen a framework, we need to decide on our ablation setup. Remember, the goal is to run experiments at a small scale and get results we can confidently extrapolate to our final production run.
+The goal of ablations is to run experiments at a small scale and get results we can confidently extrapolate to our final production run.
There are two main approaches. First, we can take our target model size and train it on fewer tokens. For the SmolLM3 ablations, we trained the full 3B model on 100B tokens instead of the final 11T. Second, if our target model is too large, we can train a smaller proxy model for ablations. For example, when Kimi was developing their 1T parameter Kimi K2 model with 32B active parameters, using the full size for all ablations would have been prohibitively expensive, so they ran some ablations on a 3B MoE with 0.5B active parameters [@kimik2].
-One key question is whether these small-scale findings actually transfer. In our experience, if something hurts performance at small scale, you can confidently rule it out for large scale. Now something works at small scale, you should still make sure you've trained on a reasonable number of tokens to conclude with high probability that these findings will extrapolate to larger scales. The longer you train and the closer the ablation models are to the final model, the better.
+One key question is whether these small-scale findings actually transfer. In our experience, if something hurts performance at small scale, you can confidently rule it out for large scale. But if something works at small scale, you should still make sure you've trained on a reasonable number of tokens to conclude with high probability that these findings will extrapolate to larger scales. The longer you train and the closer the ablation models are to the final model, the better.
-In this blog post, we'll use a baseline vanilla transformer for all ablations. Our main setup is a 1B transformer following [Llama3.2 1B](https://huggingface.co/meta-llama/Llama-3.2-1B) architecture trained on 45B tokens. This takes about 1.5 days to train on a node of 8xH100s using this nanotron [config](https://huggingface.co/datasets/HuggingFaceTB/ablations-training-configs/blob/main/baseline_config_1B.yaml) (42k tokens per second per GPU). For experiments needing stronger signal, we'll also show results from our larger setup: the 3B model trained on 100B tokens that we used for SmolLM3. You can find the 3B baseline config [here](https://huggingface.co/datasets/HuggingFaceTB/ablations-training-configs/blob/main/baseline_config_3B.yaml).
+In this blog post, we'll use a baseline vanilla transformer for all ablations. Our main setup is a 1B transformer following [Llama3.2 1B](https://huggingface.co/meta-llama/Llama-3.2-1B) architecture trained on 45B tokens. This takes about 1.5 days to train on a node of 8xH100s using this nanotron [config](https://huggingface.co/datasets/HuggingFaceTB/training-guide-nanotron-configs/blob/main/baseline_config_1B.yaml) (42k tokens per second per GPU). During SmolLM3 training, we ran these ablations on a 3B model trained on 100B tokens
+(config [here](https://huggingface.co/datasets/HuggingFaceTB/training-guide-nanotron-configs)). We'll share those results at the end of each chapter (you'll see that the conclusions align).
@@ -576,7 +535,7 @@ tokens:
...(truncated)
```
-For our ablations, we'll modify the first 3 sections while keeping everything else constant.
+For our ablations, we'll modify different sections depending on what we're testing while keeping everything else constant: the `model` section for [architecture choices](#architecture-choices), the `optimizer` section for [optimizer and training hyperparameters](#optimiser-and-training-hyperparameters), and the `data_stages` section for [data curation](#the-art-of-data-curation).
@@ -641,23 +600,23 @@ For these ablations, it's good to focus on tasks that give good early signal and
The quality of a task also depends on the task formulation (how we ask the model questions) and metric choice (how we compute the answer score).
-Three common task formulations are multiple choice format (MCF), cloze formulation (CF) and freeform generation (FG). Multiple choice format requires models to select an option from a number of choices explicitly presented in the prompt and prefixed with A/B/C/D (as is done in MMLU, for example). In cloze formulation, we compare the likelihood of the difference choices to see which one is more likely without having provided them in the prompt. In FG, we look at the accuracy of the greedy generation for a given prompt. FG requires a lot of latent knowledge in the model and is usually too difficult a task for the models to be really useful in short pre-training ablations before a full of training. We thus focus on multiple choice formulations when running small sized ablations (MCF or CF).
+Three common task formulations are multiple choice format (MCF), cloze formulation (CF) and freeform generation (FG). Multiple choice format requires models to select an option from a number of choices explicitly presented in the prompt and prefixed with A/B/C/D (as is done in MMLU, for example). In cloze formulation, we compare the likelihood of the different choices to see which one is more likely without having provided them in the prompt. In FG, we look at the accuracy of the greedy generation for a given prompt. FG requires a lot of latent knowledge in the model and is usually too difficult a task to be really useful in short pre-training ablations that stop well before a full training run. We thus focus on multiple choice formulations when running small-scale ablations (MCF or CF).
For post-trained models, FG becomes the primary formulation since
we're evaluating whether the model can actually generate useful responses.
-We'll cover evaluation for these models in the post-training chapter.
+We'll cover evaluation for these models in the [post-training chapter](#beyond-base-models--post-training-in-2025).
-Research has also shown that models struggle with MCF early in training, only learning this skill after extensive training, making CF better for early signal [@olmes, @du2025, @datacomp]. We thus use CF for small ablations, and integrate MCF in the main run as it gives better mid-training signal once a model has passed a threshold to get sufficiently high signal-over-noise ratio for MCF. A quick note also that, to score a model's answer in sequence likelihood evaluations like CF, we compute accuracy as the percentage of questions where the the correct answer has the highest log probability normalised by character count. This normalisation prevents a bias toward shorter answers.
+Research has also shown that models struggle with MCF early in training, only learning this skill after extensive training, making CF better for early signal [@olmes; @du2025; @datacomp]. We thus use CF for small ablations, and integrate MCF in the main run as it gives better mid-training signal once a model has passed a threshold to get a sufficiently high signal-to-noise ratio for MCF. A quick note also that, to score a model's answer in sequence likelihood evaluations like CF, we compute accuracy as the percentage of questions where the correct answer has the highest log probability normalised by character count. This normalisation prevents a bias toward shorter answers.
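
To make the scoring rule concrete, here's a minimal sketch of character-length-normalised CF scoring. The helper names and the toy scorer are illustrative, not our actual LightEval implementation:

```python
# Minimal sketch of cloze-formulation (CF) scoring with character-length
# normalisation (illustrative, not our LightEval implementation).

def cf_accuracy(examples, loglikelihood):
    """`loglikelihood(prompt, continuation)` is assumed to return the summed
    token log-probability of `continuation` given `prompt` under the model."""
    correct = 0
    for ex in examples:
        # Score each choice by log P(choice | prompt) / len(choice in characters):
        # normalising by character count removes the bias toward short answers.
        scores = [
            loglikelihood(ex["prompt"], choice) / len(choice)
            for choice in ex["choices"]
        ]
        correct += scores.index(max(scores)) == ex["gold_index"]
    return correct / len(examples)

# Toy usage with a made-up scorer: the longer (wrong) answer has a lower
# total log-probability, and normalisation keeps the comparison fair.
fake_scores = {"Paris": -2.0, "The city of Lyon": -7.0}
examples = [{"prompt": "The capital of France is",
             "choices": list(fake_scores), "gold_index": 0}]
print(cf_accuracy(examples, lambda p, c: fake_scores[c]))  # 1.0
```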
-The point at which MMLU MCF becomes non-random depends on the model size and training data. For a 7B transformer, [t](https://arxiv.org/pdf/2406.08446)he OLMES paper [@olmes] found the model starts showing non-random performance after 500B tokens. For 1.7B model, we found this happens after 6T tokens in SmolLM2. @du2025 argue this is fundamentally about the pre-training loss reaching a certain threshold.
+The point at which MMLU MCF becomes non-random depends on the model size and training data. For a 7B transformer, the OLMES paper [@olmes] found the model starts showing non-random performance after 500B tokens. For a 1.7B model, we found this happens after 6T tokens in SmolLM2 [@smollm2]. @du2025 argue this is fundamentally about the pre-training loss reaching a certain threshold.
-Our ablations evaluation suite includes the benchmarks from [FineWeb](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1) ablations, except for SIQA which we found to be too noisy. We add math and code benchmarks like GSM8K and HumanEval and a long context benchmark RULER for long context ablations. This aggregation of tasks test world knowledge, reasoning, and common sense across a variety of formats, as shown in the table below. To speed up evaluations at the expense of some additional noise, we only evaluate on 1,000 questions from each benchmark. We also use the cloze fomulation (CF) way of evaluating for all multiple-choice benchmarks, as explained above. Note that for multilingual ablations and actual training, we add more benchmarks to test multilinguality, which we detail later. These evaluations are run using LightEval and the individual benchmarks are covered in more detail in the [Appendix](#appendix). The table above summarises the key characteristics of each benchmark:
+Our ablations evaluation suite includes the benchmarks from [FineWeb](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1) ablations, except for SIQA which we found to be too noisy. We add math and code benchmarks like GSM8K and HumanEval and the long context benchmark RULER for long context ablations. This aggregation of tasks tests world knowledge, reasoning, and common sense across a variety of formats, as shown in the table below. To speed up evaluations at the expense of some additional noise, we only evaluate on 1,000 questions from each benchmark (except for GSM8K, HumanEval & RULER, which we used in full for the 3B SmolLM3 ablations but omit from the 1B experiments below). We also use the cloze formulation (CF) way of evaluating for all multiple-choice benchmarks, as explained above. Note that for multilingual ablations and actual training, we add more benchmarks to test multilinguality, which we detail later. These evaluations are run using [LightEval](https://github.com/huggingface/lighteval) and the table below summarises the key characteristics of each benchmark:
| Benchmark | Domain | Task Type | Questions | What it Tests |
| --- | --- | --- | --- | --- |
@@ -747,7 +706,7 @@ Now that we have our experimental framework in place, it's time to make the big
Remember the [training compass](#training-compass-why--what--how): before making any technical choices, we need clarity on the *why* and *what* . Why are we training this model, and what should it look like?
-It sounds obvious, but as we explained in the Training Compass, being deliberate here shapes our decisions and keeps us from getting lost in the endless space of possible experiments. Are we aiming for a SOTA model in English? Is long context a priority? Or a we trying to validate a new architecture? The training loop may look similar in all these cases, but the experiments we run and the trade-offs we accept will be different. Answering this question early helps us decide how to balance our time between data and architecture work, and how much to innovate in each before starting the run.
+It sounds obvious, but as we explained in the training compass, being deliberate here shapes our decisions and keeps us from getting lost in the endless space of possible experiments. Are we aiming for a SOTA model in English? Is long context a priority? Or are we trying to validate a new architecture? The training loop may look similar in all these cases, but the experiments we run and the trade-offs we accept will be different. Answering this question early helps us decide how to balance our time between data and architecture work, and how much to innovate in each before starting the run.
So, let's lead by example and walk through the goals that guided SmolLM3's design. We wanted a strong model for on-device applications with competitive multilingual performance, solid math and coding capabilities, and robust long context handling. As we mentioned earlier, this led us to a dense model with 3B parameters: large enough for strong capabilities but small enough to fit comfortably on phones. We went with a dense transformer rather than MoE or Hybrid given the memory constraints of edge devices and our project timeline (roughly 3 months).
@@ -829,7 +788,7 @@ Note that the leading factor of 2 comes from storing both key and value caches.
The natural question to ask is: do we really need new KV values for each head? Probably not, and both Multi-Query Attention (MQA) [@mqa] and Grouped Query Attention (GQA) [@gqa] address this. The simplest case is to share the KV values across all heads, thus dividing the size of the KV cache by $n_{heads}$ , which is e.g. a 64x decrease for Llama 3 70B! This is the idea of MQA and was used in some models like StarCoder as an alternative to MHA. However, we might give away a bit more attention capacity than we are willing to, so we could instead share the KV values across groups of heads, e.g. 4 heads sharing the same KV values. This is the GQA approach and strikes a middle ground between MQA and MHA.
-More recently, DeepSeek-v2 (and also used in v3) introduced *Multi-Latent Attention (MLA) [@deepseekv2]* , which uses a different strategy to compress the cache: rather than reducing the number KV-values it reduces their size and simply stores a latent variable which can be decompressed into KV values at runtime. With this approach they managed to reduce the cache to an equivalent of GQA with 2.25 groups while giving stronger performance than MHA! In order to make this work with RoPE, a small tweak with an extra small latent vector is needed. In DeepSeek-v2 they chose $4*dim_{head}$ for the main latent variable and $1/2*dim_{head}$ for the RoPE part so a total fo $4.5*dim_{head}$ which is used for both K and V simultaneously thus dropping the leading factor of 2.
+More recently, DeepSeek-v2 (and also used in v3) introduced *Multi-Latent Attention (MLA)* [@deepseekv2], which uses a different strategy to compress the cache: rather than reducing the number of KV values it reduces their size and simply stores a latent variable which can be decompressed into KV values at runtime. With this approach they managed to reduce the cache to an equivalent of GQA with 2.25 groups while giving stronger performance than MHA! In order to make this work with RoPE, a small tweak with an extra small latent vector is needed. In DeepSeek-v2 they chose $4*dim_{head}$ for the main latent variable and $1/2*dim_{head}$ for the RoPE part, so a total of $4.5*dim_{head}$ which is used for both K and V simultaneously, thus dropping the leading factor of 2.
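
To see what these variants buy you in memory, here's a back-of-the-envelope sketch of KV-cache size per token. The layer and head counts are roughly Llama-3-70B-shaped and the MLA latent size follows the $4.5*dim_{head}$ figure above; treat the exact numbers as illustrative:

```python
# Back-of-the-envelope KV-cache size per token, in bytes (bf16 = 2 bytes).

def kv_cache_bytes_per_token(n_layers, n_kv_heads, head_dim, bytes_per_value=2):
    # Factor of 2: we store both the key and the value cache.
    return 2 * n_layers * n_kv_heads * head_dim * bytes_per_value

n_layers, n_heads, head_dim = 80, 64, 128  # roughly Llama-3-70B-shaped

mha = kv_cache_bytes_per_token(n_layers, n_kv_heads=n_heads, head_dim=head_dim)
mqa = kv_cache_bytes_per_token(n_layers, n_kv_heads=1, head_dim=head_dim)
gqa = kv_cache_bytes_per_token(n_layers, n_kv_heads=8, head_dim=head_dim)
# MLA stores a single shared latent of ~4.5 * head_dim per layer instead of
# separate K and V, so the leading factor of 2 disappears.
mla = n_layers * int(4.5 * head_dim) * 2

for name, size in [("MHA", mha), ("MQA", mqa), ("GQA-8", gqa), ("MLA", mla)]:
    print(f"{name:6s}: {size / 2**20:.2f} MiB per token of context")
```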
@@ -1035,7 +994,7 @@ smoothingWindow: 15
-We don't observe any impact on short context tasks, similar to Llama3. However, document masking becomes crucial when scaling to long sequences to speed up the training. This is particularly important for our long context extension, where we scale from 4k to 64k tokens (detailed in the [Training marathon](#training-marathon) chapter). We therefore adopted it for SmolLM3 throughout the full training run.
+Similar to Llama3, we don't observe a noticeable impact on short context tasks, except for a small improvement on PIQA. However, document masking becomes crucial when scaling to long sequences to speed up the training. This is particularly important for our long context extension, where we scale from 4k to 64k tokens (detailed in the [Training marathon](#the-training-marathon) chapter). We therefore adopted it for SmolLM3 throughout the full training run.
We've covered in this section how attention processes sequences. Now let's look at another major parameter block in transformers: the embeddings.
@@ -1087,18 +1046,18 @@ charts: [
{ title: "WinoGrande", metric: "winogrande" }
],
smoothing: true,
-smoothingWindow: 15
+smoothingWindow: 5
}}
/>
-The loss and evaluation results demonstrate that our baseline 1.2B model with tied embeddings achieves comparable performance to the 1.46B untied equivalent, despite having 18% fewer parameters. The 1.2B model with untied embeddings and reduced layers (12 vs 16) underperforms both configurations, exhibiting higher loss and lower downstream evaluation scores. This suggests that increasing model depth provides greater benefits than untying embeddings at equivalent parameter budgets.
+The loss and evaluation results demonstrate that our baseline 1.2B model with tied embeddings achieves performance comparable to the 1.46B untied equivalent on all the benchmarks except WinoGrande, despite having 18% fewer parameters. The 1.2B model with untied embeddings and reduced layers (12 vs 16) underperforms both configurations, exhibiting higher loss and lower downstream evaluation scores. This suggests that increasing model depth provides greater benefits than untying embeddings at equivalent parameter budgets.
Based on these results, we kept tied embeddings for our SmolLM3 3B model.
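
As a quick sanity check on those parameter counts, here's the arithmetic, assuming for illustration Llama-3.2-1B-like dimensions (a ~128k vocabulary and a 2048 hidden size):

```python
# Rough parameter accounting for tied vs untied embeddings
# (illustrative Llama-3.2-1B-like dimensions).
vocab_size, d_model = 128_256, 2048

embedding_params = vocab_size * d_model  # input embedding matrix
lm_head_params = vocab_size * d_model    # separate output projection if untied

print(f"Input embeddings:       {embedding_params / 1e6:.0f}M params")
print(f"Extra params if untied: {lm_head_params / 1e6:.0f}M params")
# Tying the two matrices saves roughly 0.26B parameters here, which is about
# the gap between the 1.2B (tied) and 1.46B (untied) ablation models.
```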
-We've now explored the main attention mechanisms and their tradeoffs. But attention alone does not capture the order of tokens in a sequence; providing this information is the role of positional encodings. In the next section, we will look at how positional encoding strategies have evolved, from standard RoPE to newer approaches like NoPE (No Position Embedding), which enable more effective modeling for long contexts.
+We've now explored the embedding sharing strategy and its tradeoffs. But embeddings alone don't capture the order of tokens in a sequence; providing this information is the role of positional encodings. In the next section, we will look at how positional encoding strategies have evolved, from standard RoPE to newer approaches like NoPE (No Position Embedding), which enable more effective modeling for long contexts.
#### **Positional Encodings & Long Context**
@@ -1108,7 +1067,7 @@ The solution is positional embeddings: mathematical encodings that give each tok
**The Evolution of Position Encoding**
-Early transformers used simple **Absolute Position Embeddings (APE)** [@transformer **]** , essentially learned lookup tables that mapped each position (1, 2, 3...) to a vector that gets added to token embeddings. This worked fine for short sequences but had a major limitation: models max input sequence length was limited to the max input sequence length they were trained on. They had no out-of-the-box generalisation capabilities to longer sequences.
+Early transformers used simple **Absolute Position Embeddings (APE)** [@transformer], essentially learned lookup tables that mapped each position (1, 2, 3...) to a vector that gets added to token embeddings. This worked fine for short sequences but had a major limitation: a model's maximum input sequence length was capped at the length it was trained on. Models had no out-of-the-box generalisation capabilities to longer sequences.
The field evolved toward **relative position encodings** that capture the distance between tokens rather than their absolute positions. This makes intuitive sense, whether two words are 3 positions apart matters more than whether they're at positions (5,8) versus (105,108).
@@ -1119,7 +1078,7 @@ For a deeper dive into positional encoding, [this blog](https://huggingface.co/b
**ALiBi** (Attention with Linear Biases) [@alibi], in particular, modifies the attention scores based on token distance. The further apart two tokens are, the more their attention gets penalized through simple linear biases applied to attention weights. For a detailed implementation of Alibi, check this [resource](https://nn.labml.ai/transformers/alibi/index.html).
-But the technique that has dominated recent large language models is **Rotary Position Embedding (RoPE)** [@rope].
+But the technique that has dominated recent large language models is Rotary Position Embedding (RoPE) [@rope].
**RoPE: Position as Rotation**
@@ -1205,7 +1164,7 @@ The attention pattern depends only on (m-n), so tokens that are 5 positions apar
**How to set RoPE Frequency?**
-In practice, most LLM pretraining starts with relatively short context lengths (2K-4K tokens) using RoPE base frequencies of a few tens thousands like 10K or 50K. Training with very long sequences from the start would be computationally expensive due to attention's quadratic scaling with sequence length and the limited availability of long-context data (samples > 4K context length) as we've seen before in the document masking section of [Attention](#attention). Research also suggests it can hurt short-context performance [@skyladder]. Models typically start by learning short range correlation between words so long sequences don't help much. The typical approach is to do most pretraining with shorter sequences, then do continual pretraining or spend the final few hundred billion tokens on longer sequences. However, as sequence lengths grow, the rotation angles which are proportional to token positions, grow and can cause attention scores for distant tokens to decay too rapidly [@xiong2023effectivelongcontextscalingfoundation; rozière2024codellamaopenfoundation]:
+In practice, most LLM pretraining starts with relatively short context lengths (2K-4K tokens) using RoPE base frequencies of a few tens of thousands, like 10K or 50K. Training with very long sequences from the start would be computationally expensive due to attention's quadratic scaling with sequence length and the limited availability of long-context data (samples > 4K context length) as we've seen before in the document masking section of [Attention](#attention). Research also suggests it can hurt short-context performance [@skyladder]. Models typically start by learning short range correlation between words so long sequences don't help much. The typical approach is to do most pretraining with shorter sequences, then do continual pretraining or spend the final few hundred billion tokens on longer sequences. However, as sequence lengths grow, the rotation angles, which are proportional to token positions, grow and can cause attention scores for distant tokens to decay too rapidly [@xiong2023effectivelongcontextscalingfoundation; @rozière2024codellamaopenfoundation]:
```python
θ = position * 1 / (base ** (k / (dim / 2)))
@@ -1265,7 +1224,7 @@ charts: [
{ title: "WinoGrande", metric: "winogrande" }
],
smoothing: true,
-smoothingWindow: 15
+smoothingWindow: 5
}}
/>
@@ -1332,7 +1291,7 @@ But having the right architecture is only half the battle. Even well-designed mo
Let's now turn to one of the biggest challenges in LLM pretraining: instabilities. Often manifesting as loss spikes or sudden jumps in training loss, these issues become especially common at scale.
-While we'll dive deeper into the different types of spikes and how to handle them in the [Training Marathon](#training-marathon) section (diving in floating point precision, optimizers and learning rate), certain architectural and training techniques can also help us reduce instability so let's take a moment to study them here. We'll cover a few simple techniques used in recent large-scale training runs (e.g., Olmo2 [@olmo2] and Qwen3 [@qwen3]) to improve stability: Z-loss, removing weight decay from embeddings, and QK-norm.
+While we'll dive deeper into the different types of spikes and how to handle them in the [Training Marathon](#the-training-marathon) section (diving in floating point precision, optimizers and learning rate), certain architectural and training techniques can also help us reduce instability so let's take a moment to study them here. We'll cover a few simple techniques used in recent large-scale training runs (e.g., Olmo2 [@olmo2] and Qwen3 [@qwen3]) to improve stability: Z-loss, removing weight decay from embeddings, and QK-norm.
**Z-loss**
@@ -1380,7 +1339,7 @@ smoothingWindow: 15
-The ablation results below on our 1B model show that adding z-loss doesn't impact the training loss or downstream performance. For SmolLM3, we ended up not using it because our implementation introduced some training overhead that we didn't optimized by the time we started training.
+The ablation results below on our 1B model show that adding Z-loss doesn't impact the training loss or downstream performance. For SmolLM3, we ended up not using it because our Z-loss implementation introduced some training overhead that we hadn't optimized by the time we started training.
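
For reference, here's a minimal sketch of what adding Z-loss to the objective looks like, using the common formulation that penalises the squared log of the softmax normaliser; the coefficient and shapes are illustrative, and this is not our nanotron implementation:

```python
import torch
import torch.nn.functional as F

def cross_entropy_with_z_loss(logits, targets, z_loss_coef=1e-4):
    """Cross-entropy plus Z-loss. The Z-loss term penalises log(Z), the log of
    the softmax normaliser, to keep output logits from drifting too large.
    The 1e-4 coefficient is a commonly used value, shown for illustration."""
    ce = F.cross_entropy(logits, targets)
    log_z = torch.logsumexp(logits, dim=-1)        # log of the softmax normaliser
    z_loss = z_loss_coef * (log_z ** 2).mean()
    return ce + z_loss

# Usage with dummy shapes: 4 positions over a 32k vocabulary.
logits = torch.randn(4, 32_000)
targets = torch.randint(0, 32_000, (4,))
loss = cross_entropy_with_z_loss(logits, targets)
```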
**Removing weight decay from embeddings**
@@ -1432,17 +1391,15 @@ QK-norm [@dehghani2023scalingvisiontransformers22] applies layer normalization t
However, @rnope found that QK-norm hurts long-context tasks. Their analysis revealed that QK-norm results in lower attention mass on relevant tokens (needles) and higher attention mass on irrelevant context. They argue this occurs because the normalization operation removes magnitude information from the query-key dot product, which makes the attention logits closer in terms of magnitude. For this reason, we didn't use QK-norm in SmolLM3. Additionally, as a small 3B parameter model, SmolLM3 faces less risk of training instability compared to the larger models where QK-norm has proven most beneficial.
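
For completeness, here's a rough sketch of what QK-norm looks like in code: queries and keys are normalised per head before RoPE and the attention dot product. The module layout follows common implementations rather than any specific codebase:

```python
import torch
import torch.nn as nn

class QKNormProjections(nn.Module):
    """Sketch of QK-norm: normalise queries and keys per head so that the
    attention logit magnitudes stay bounded."""

    def __init__(self, d_model, n_heads):
        super().__init__()
        self.n_heads, self.head_dim = n_heads, d_model // n_heads
        self.q_proj = nn.Linear(d_model, d_model, bias=False)
        self.k_proj = nn.Linear(d_model, d_model, bias=False)
        self.q_norm = nn.LayerNorm(self.head_dim)  # applied per head
        self.k_norm = nn.LayerNorm(self.head_dim)

    def forward(self, x):
        b, t, _ = x.shape
        q = self.q_proj(x).view(b, t, self.n_heads, self.head_dim)
        k = self.k_proj(x).view(b, t, self.n_heads, self.head_dim)
        # QK-norm happens before RoPE and the attention dot product.
        return self.q_norm(q), self.k_norm(k)

q, k = QKNormProjections(d_model=2048, n_heads=16)(torch.randn(2, 8, 2048))
```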
-We've now covered the core building blocks of the transformer: attention mechanisms, positional encodings, embeddings, and stability techniques. Next, we'll discuss broader architectural decisions like choosing between dense models, sparse MoEs, or Hybrid architectures. After that, we'll end with the tokenizer, a crucial component in LLM training.
-
#### Other core components
Beyond the components we've covered, there are a couple other architectural decisions worth noting for completeness.
-To initialize parameters, modern models typically use truncated normal initialization (mean=0, std=0.02 or std=0.006) or initialization scheme like muP (Cohere, grok) that can be another topic of ablations.
+To initialize parameters, modern models typically use truncated normal initialization (mean=0, std=0.02 or std=0.006) or an initialization scheme like muP [@mup], as used for instance in Cohere's Command A [@commandacohere]. This could be another topic for ablations.
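
As a small illustration of the truncated normal option (the std values mirror the ones quoted above; the ±3σ truncation bounds are a common convention, not taken from a specific paper):

```python
import torch.nn as nn

def init_weights(module, std=0.02):
    """Truncated normal init (mean 0, given std), truncated at ±3 std."""
    if isinstance(module, (nn.Linear, nn.Embedding)):
        nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-3 * std, b=3 * std)
        if isinstance(module, nn.Linear) and module.bias is not None:
            nn.init.zeros_(module.bias)

model = nn.Sequential(nn.Embedding(32_000, 2048), nn.Linear(2048, 2048))
model.apply(lambda m: init_weights(m, std=0.02))
```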
-In terms of **activation functions** , SwiGLU has become a de facto standard in modern LLMs (except Gemma2 using GeGLU and nvidia using relu^2 [@nvidia2025nvidianemotronnano2; nvidia2024nemotron4340btechnicalreport]), replacing older choices like ReLU or GELU.
+In terms of activation functions, SwiGLU has become a de facto standard in modern LLMs (exceptions include Gemma2, which uses GeGLU, and NVIDIA's Nemotron models, which use ReLU^2 [@nvidia2025nvidianemotronnano2; @nvidia2024nemotron4340btechnicalreport]), replacing older choices like ReLU or GELU.
-At a broader scale, architectural layout choices also play a role in shaping model behavior. Although the total parameter count largely determines a language model's capacity, how those parameters are distributed across depth and width also matters. [Petty et al](https://arxiv.org/abs/2310.19956). found that deeper models outperform equally sized wider ones on language-modeling and compositional tasks until the benefit saturates. This "deep-and-thin" strategy works well for sub-billion-parameter LLMs in MobileLLM ablations [@mobilellm], whereas wider models tend to offer faster inference thanks to greater parallelism. Modern architectures reflect this trade-off differently as noted in this[ blog post](https://sebastianraschka.com/blog/2025/the-big-llm-architecture-comparison.html).
+At a broader scale, architectural layout choices also play a role in shaping model behavior. Although the total parameter count largely determines a language model's capacity, how those parameters are distributed across depth and width also matters. @petty2024impactdepthcompositionalgeneralization found that deeper models outperform equally sized wider ones on language-modeling and compositional tasks until the benefit saturates. This "deep-and-thin" strategy works well for sub-billion-parameter LLMs in MobileLLM ablations [@mobilellm], whereas wider models tend to offer faster inference thanks to greater parallelism. Modern architectures reflect this trade-off differently, as noted in this [blog post](https://sebastianraschka.com/blog/2025/the-big-llm-architecture-comparison.html).
We've now covered the most important aspects of the dense transformer architecture worth optimizing for your training run. However, other architecture interventions that concern the model as a whole have recently emerged, namely MoE and hybrid models. Let's have a look at what they have to offer, starting with the MoEs.
@@ -1546,7 +1503,9 @@ $$
A higher granularity value corresponds to having more experts with smaller dimension (given a fixed number of parameters). This metric is a ratio between the expert dimension ( $d_{expert}$ ) and the model dimension ( $d_{model}$ ).
-In dense models, a common rule of thumb is to have the dimension of the MLP set to $d_{intermediate} = 4 * d_{model}$ . If $\alpha = 4$ (like @krajewski2024scalinglawsfinegrainedmixture) you can loosely view granularity as **how many experts it would take to match the dense MLP width (** $4\, d_{\text{model}} = d_{\text{intermediate}} = G\, d_{\text{expert}}$ **)** . That interpretation is only a rough heuristic: modern MoE designs often allocate much larger total capacity than a single dense MLP, so the one-to-one match breaks down in practice. The Ant team setup choose $\alpha = 2$ which is simply a **different normalization choice** . For consistency we will pick this convention and stick to it.
+In dense models, a common rule of thumb is to have the dimension of the MLP set to $d_{intermediate} = 4 * d_{model}$ . If $\alpha = 4$ (like @krajewski2024scalinglawsfinegrainedmixture), you can loosely view granularity as **how many experts it would take to match the dense MLP width** ( $4\, d_{\text{model}} = d_{\text{intermediate}} = G\, d_{\text{expert}}$ ).
+
+This interpretation is only a rough heuristic: modern MoE designs often allocate much larger total capacity than a single dense MLP, so the one-to-one match breaks down in practice. The Ant team chose $\alpha = 2$ , which is simply a different normalization choice. For consistency we will pick this convention and stick to it.
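
To make the bookkeeping concrete, here's a tiny sketch using the $\alpha = 2$ convention, $G = \alpha \cdot d_{model} / d_{expert}$ , with made-up dimensions purely for illustration:

```python
# Granularity bookkeeping for an MoE layer, using the alpha = 2 convention
# adopted above: G = alpha * d_model / d_expert. Dimensions are made up.

def granularity(d_model, d_expert, alpha=2):
    return alpha * d_model / d_expert

d_model = 2048
for d_expert in (4096, 1024, 512):
    print(f"d_expert={d_expert:5d} -> granularity G = {granularity(d_model, d_expert):.1f}")
# Smaller experts (at a fixed total parameter count) mean higher granularity:
# more, narrower experts for the router to choose from.
```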
@@ -1612,7 +1571,7 @@ It is also possible to achieve balancing without an explicit loss term. DeepSeek
-A key detail is the scope at which you compute routing statistics: are $f_i$ and $P_i$ computed per local batch (each worker's mini-batch) or globally (aggregated across workers/devices)? The Qwen team's analysis [@qiu2025demonsdetailimplementingload] shows that when there isn't enough token diversity in each local batch and that local computation can hurt both expert specialization (a good proxy for routing health) and overall model performance. Expert specialization is the phenomenon where one or more experts are activated more often than others for a specific domain. In other words, if a local batch is narrow, its routing stats become noisy/biased, and don't lead to good balancing. This implies that we should use global statistics (or at least cross-device aggregation) whenever feasible. Notably, at the time of that paper, many frameworks—including Megatron—computed these statistics locally by default.
+A key detail is the scope at which you compute routing statistics: are $f_i$ and $P_i$ computed per local batch (each worker's mini-batch) or globally (aggregated across workers/devices)? The Qwen team's analysis [@qiu2025demonsdetailimplementingload] shows that when there isn't enough token diversity in each local batch, local computation can hurt both expert specialization (a good proxy for routing health) and overall model performance. Expert specialization is the phenomenon where one or more experts are activated more often than others for a specific domain. In other words, if a local batch is narrow, its routing stats become noisy/biased, and don't lead to good balancing. This implies that we should use global statistics (or at least cross-device aggregation) whenever feasible. Notably, at the time of that paper, many frameworks—including Megatron—computed these statistics locally by default.
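
To make the local-vs-global distinction concrete, here's a toy sketch of computing $f_i$ (fraction of tokens routed to expert $i$ ) and $P_i$ (mean router probability for expert $i$ ) and plugging them into a standard auxiliary loss. The top-1 routing and the aggregation pattern are simplifications for illustration, not a particular framework's implementation:

```python
import torch

def load_balancing_stats(router_probs, expert_ids, n_experts):
    """router_probs: [tokens, n_experts] softmax outputs of the router.
    expert_ids: [tokens] expert each token was routed to (top-1 for simplicity).
    Returns f (fraction of tokens per expert) and P (mean router prob per expert)."""
    f = torch.bincount(expert_ids, minlength=n_experts).float() / expert_ids.numel()
    P = router_probs.mean(dim=0)
    return f, P

def aux_loss(f, P, n_experts, coef=0.01):
    # One common form of the auxiliary loss: n_experts * sum_i f_i * P_i.
    return coef * n_experts * torch.sum(f * P)

# Local vs global: computing f and P per worker micro-batch gives noisier,
# more biased statistics than aggregating over the global batch.
n_experts = 8
local_probs = [torch.softmax(torch.randn(64, n_experts), dim=-1) for _ in range(4)]
global_probs = torch.cat(local_probs)          # aggregate across workers
global_ids = global_probs.argmax(dim=-1)       # top-1 routing decision
f, P = load_balancing_stats(global_probs, global_ids, n_experts)
print(aux_loss(f, P, n_experts))
```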
The following plot from Qwen's paper illustrates the difference between micro-batch and global-batch aggregation and its impact on performance and specialization:
@@ -1721,7 +1680,9 @@ What's interesting here is that on retrieval tasks like Needle in a Haystack (NI
-Surprisingly, the recently released MiniMax M2 does not use hybrid or linear Attention. According to their lead NLP researcher, they found it [hurts performance on reasoning-related tasks](https://x.com/zpysky1125/status/1982847594926911984). This highlights the complexity of architecture ablations. We'll update this section once MiniMax releases their technical report with detailed ablations.
+Surprisingly, the recently released MiniMax M2 does not use hybrid or linear attention. According to their pretraining lead, while their early MiniMax M1 experiments with Lightning Attention looked promising at smaller scales on the popular benchmarks at the time (MMLU, BBH, MATH), they found it had "clear deficits in complex, multi-hop reasoning tasks" at larger scales. They also cite numerical precision issues during RL training and infrastructure maturity as key blockers. They conclude that making architecture choices at scale is a hard, compute-intensive, multivariable problem due to the sensitivity to other parameters like the data distribution and optimizer.
+
+However, they acknowledge that "as GPU compute growth slows while data length keeps increasing, the benefits of linear and sparse attention will gradually emerge." This highlights both the complexity of architecture ablations and the gap between research and production reality.
Now let's have a look at some more of these methods and how they can be understood with a unified framework.
@@ -1757,13 +1718,9 @@ Almost all recent linear attention methods have this gating component with just
-One notable variant is Mamba-2 [@mamba2] on the list. It's used in many of the hybrid models like Nemotron-H, Falcon H1, and Granite-4.0-h [@nemotronh; @falconh1; @granite4]. Overall, hybrid models are moving quickly and are a solid choice for frontier training. Qwen3-Next (with a gated DeltaNet update) [@qwen3next] reports they are:
-
-- faster at inference for long context (the primary goal)
-- faster to train
-- and stronger on benchmarks
+One notable variant is Mamba-2 [@mamba2] on the list. It's used in many of the hybrid models like Nemotron-H [@nemotronh], Falcon H1 [@falconh1], and Granite-4.0-h [@granite4].
-However, it's a bit too early for definitive consensus but they are definitely a choice to carefully consider for your next models!
+However, it's still early days and there's important nuance to consider when scaling to large hybrid models. While they show promise, MiniMax's experience with [M2](https://x.com/zpysky1125/status/1983383094607347992) highlights that benefits at small scale don't always translate to large-scale production systems, particularly for complex reasoning tasks, RL training stability, and infrastructure maturity. That said, hybrid models are moving quickly and remain a solid choice for frontier training. Qwen3-Next (with a gated DeltaNet update) [@qwen3next] reports they are faster at inference for long context, faster to train, and stronger on the usual benchmarks. We are also looking forward to Kimi's next model, which will most likely use their new ["Kimi Delta Attention"](https://github.com/fla-org/flash-linear-attention/pull/621). Let's also mention sparse attention, which addresses the same long-context issue as linear attention by selecting blocks or queries to compute attention over. Some examples are Native Sparse Attention [@nsa], DeepSeek Sparse Attention [@dsa] and InfLLM v2 [@minicpm4].
We'll wrap up the architecture choices before moving to tokenizers by building a small decision tree to determine whether to train a dense, a MoE or a Hybrid model.
@@ -1797,7 +1754,7 @@ Use when you're not memory-constrained and want maximum performance per compute.
Pros: Potentially better long-context handling. More efficient for very long sequences.
-Cons: Less mature with fewer proven training recipes. Limited framework support.
+Cons: Less mature than dense and MoE with fewer proven training recipes. Limited framework support.
@@ -1811,49 +1768,10 @@ We're also now seeing some teams explore diffusion models for text, but these mo
So to recap, start by asking where your model will be deployed. Then consider your team's expertise and your training timeline to assess how much exploration you can afford:
-```mermaid
-flowchart TD
- A["Where will this model run?"] --> B["Edge/Phones
- Memory-constrained environments"]
- A --> C["Other
- More memory available"]
-
- B --> D["Dense (most cases)
- Hybrid or other (for experienced teams)"]
-
- C --> E["What's your team's expertise?"]
-
- E --> F["First LLM training"]
- E --> G["Experienced
- Comfortable with dense"]
- E --> H["Very experienced"]
-
- F --> I["Dense
- (Focus on basics)"]
-
- G --> J["What's your timeline?"]
-
- J --> K["Tight
- Proven path required"]
- J --> L["Flexible
- Open to exploration"]
-
- K --> M["Dense"]
- L --> N["MoE or MoE + Hybrid: better perf/compute"]
-
- H --> O["MoE or MoE + Hybrid: better perf/compute"]
-
- classDef question fill:#ffd0c5,stroke:#ffb9b7,stroke-width:2px,color:#2d3748
- classDef decision fill:#f9f9f9,stroke:#e5e5e5,stroke-width:2px,color:#374151
- classDef success fill:#d1f2eb,stroke:#a3e4d7,stroke-width:2px,color:#0e5132
- classDef danger fill:#fef3c7,stroke:#fbbf24,stroke-width:2px,color:#78350f
- classDef category fill:#fef3c7,stroke:#fbbf24,stroke-width:2px,color:#78350f
-
- class A,E,J question
- class B,C,F,G,H,K,L decision
- class D,I,M success
- class N,O category
-```
+
+
+
+
For SmolLM3 we wanted to build a strong small model for on-device deployment, we had roughly a 3-month timeline, and we had mostly trained dense models in the past. This ruled out MoE (memory constraints) and hybrid (short timeline to explore a new architecture, and dense models could get the long context we targeted of 128k tokens max), so we went for a Llama-style dense model.
Now that we have studied the internals of the model architecture, let's look at the tokenizer, which forms the bridge between the data and our model.
@@ -2092,7 +2010,7 @@ Here's what we tested before launching the run that made the cut:
**Grouped Query Attention (GQA)** : We reconfirmed our earlier finding that GQA with 4 groups matches Multi-Head Attention performance, but this time at 3B scale with 100B tokens. The KV cache efficiency gains were too good to pass up, especially for on-device deployment where memory is precious.
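To make the KV cache saving concrete, here is a rough back-of-the-envelope sketch in Python. The layer count, head count, head dimension, and context length below are illustrative placeholders, not SmolLM3's exact configuration.

```python
def kv_cache_bytes(layers: int, kv_heads: int, head_dim: int, seq_len: int, dtype_bytes: int = 2) -> int:
    # 2 accounts for storing both K and V; dtype_bytes=2 assumes bf16
    return 2 * layers * kv_heads * head_dim * seq_len * dtype_bytes

# Illustrative 3B-ish config (placeholder values, not SmolLM3's exact shapes)
layers, q_heads, head_dim, seq_len = 36, 16, 128, 65_536

mha = kv_cache_bytes(layers, kv_heads=q_heads, head_dim=head_dim, seq_len=seq_len)  # MHA: one KV head per query head
gqa = kv_cache_bytes(layers, kv_heads=4, head_dim=head_dim, seq_len=seq_len)        # GQA with 4 KV head groups

print(f"MHA KV cache at 64k context: {mha / 1e9:.1f} GB")  # ~19.3 GB
print(f"GQA (4 groups):              {gqa / 1e9:.1f} GB")  # ~4.8 GB, a 4x reduction
```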
- **NoPE for long context** : We implemented NoPE, by removing RoPE every 4th layer. Our 3B ablation confirmed the findings in the section above. NoPE improved long context handling without sacrificing short context performance.
+ **NoPE for long context** : We implemented NoPE by removing RoPE in every 4th layer. Our 3B ablation confirmed the findings in the section above: NoPE improved long context handling without sacrificing short context performance.
 **Intra-document attention masking** : We prevent cross-document attention during training to help with training speed and stability when training on very large sequences; again, we found that this doesn't impact downstream performance.
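As a rough illustration of what intra-document masking means, here is a minimal sketch that builds a block-diagonal causal mask from per-token document IDs in a packed sequence. Real implementations typically pass document boundaries to a fused attention kernel instead of materialising the full mask, so treat this purely as a visual aid.

```python
import torch

def intra_document_causal_mask(doc_ids: torch.Tensor) -> torch.Tensor:
    """doc_ids: (seq_len,) tensor mapping each packed token to its source document.
    Returns a (seq_len, seq_len) boolean mask where True means attention is allowed."""
    idx = torch.arange(doc_ids.shape[0])
    causal = idx.unsqueeze(1) >= idx.unsqueeze(0)            # standard causal mask
    same_doc = doc_ids.unsqueeze(1) == doc_ids.unsqueeze(0)  # block-diagonal per document
    return causal & same_doc

# Packed sequence holding three documents of lengths 3, 2, and 3
doc_ids = torch.tensor([0, 0, 0, 1, 1, 2, 2, 2])
mask = intra_document_causal_mask(doc_ids)
print(mask.int())  # token 3 (start of doc 1) attends only to itself, never to doc 0
```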
@@ -2253,7 +2171,7 @@ Many teams now use schedules where you don't need to start decaying immediately
-These schedules offer practical advantages over cosine decay. We can extend training mid-run without restarting, whether we want to train longer than initially planned, are decay early to get a more measure of training progress and we can run scaling law experiments across different token counts with one main training run. Moreover, studies show that both WSD and Multi-Step match cosine decay [@hägele2024scalinglawscomputeoptimaltraining, @deepseekai2024deepseekllmscalingopensource] while being more practical for real-world training scenarios.
+These schedules offer practical advantages over cosine decay. We can extend training mid-run without restarting, whether we want to train longer than initially planned or decay early to get a measure of training progress, and we can run scaling-law experiments across different token counts with one main training run. Moreover, studies show that both WSD and Multi-Step match cosine decay [@wsdhagele; @deepseekai2024deepseekllmscalingopensource] while being more practical for real-world training scenarios.
@@ -2262,7 +2180,7 @@ recently GLM 4.5 mentions that WSD perform worse on general benchmarks (SimpleQA
But you probably noticed that these schedules introduce new hyperparameters compared to cosine: How long should the decay phase last in WSD? And how long should each step be in the Multi-Step variant?
-- For WSD: The required cooldown duration to match cosine performance decreases with longer training runs, and it is recommended to allocate 10-20% of total tokens to the decay phase [@hägele2024scalinglawscomputeoptimaltraining]. We will confirm this setup matches cosine in our ablations below.
+- For WSD: The required cooldown duration to match cosine performance decreases with longer training runs, and it is recommended to allocate 10-20% of total tokens to the decay phase [@wsdhagele]. We will confirm this setup matches cosine in our ablations below; a minimal sketch of the schedule follows this list.
- For Multi-Step: DeepSeek LLM's ablations found that while their baseline 80/10/10 split (stable until 80%, first step from 80-90%, second step from 90-100%) matches cosine, tweaking these proportions can even outperform it, for instance when using 70/15/15 and 60/20/20 splits.
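Here is a minimal sketch of a WSD schedule as a learning-rate multiplier over training steps. The warmup length, 15% cooldown fraction, and linear decay shape are illustrative choices, not the exact SmolLM3 settings.

```python
def wsd_lr(step: int, total_steps: int, warmup_steps: int, decay_frac: float = 0.15) -> float:
    """Warmup-Stable-Decay multiplier in [0, 1]; multiply by the peak learning rate."""
    decay_start = int(total_steps * (1.0 - decay_frac))
    if step < warmup_steps:                      # linear warmup
        return step / max(warmup_steps, 1)
    if step < decay_start:                       # long stable plateau at peak LR
        return 1.0
    # linear cooldown over the last `decay_frac` of training
    return max(0.0, (total_steps - step) / (total_steps - decay_start))

# Example: 100k steps, 2k warmup, 15% cooldown
print(wsd_lr(50_000, 100_000, 2_000))  # 1.0 (stable phase)
print(wsd_lr(95_000, 100_000, 2_000))  # ~0.33 (inside the cooldown)
```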
But we can get even more creative with these schedules. Let's look at the schedules used in each family of the DeepSeek models:
@@ -2322,7 +2240,7 @@ charts: [
{ title: "OpenBookQA", metric: "openbookqa" },
{ title: "WinoGrande", metric: "winogrande" }
],
-smoothing: true,
+smoothing: false,
smoothingWindow: 5
}}
/>
@@ -2444,9 +2362,7 @@ Another interesting approach is treating the loss as a proxy for the critical ba
In practice, here's how you can choose the batch size and learning rate:
-- You first pick the batch size and learning rate you consider optimal, either from scaling laws (see later!) or
-
-from literature.
+- You first pick the batch size and learning rate you consider optimal, either from scaling laws (see later!) or from literature.
- Then, you can tune the batch size to see if you can improve the training throughput.
The key insight is that there's often a range between your starting batch size
@@ -2475,7 +2391,7 @@ The constant 6 comes from empirical estimates of how many floating-point operati
-If you want a more precise measure taking into account MoE layers and Hybrid layers you can checkout the [ `num_floating_point_operations` ](https://github.com/NVIDIA/Megatron-LM/blob/f34fa11af6f5dc65f5342f2a785c3227446cebfd/megatron/training/training.py#L158) function in Megatron-LM.
+If you want a more precise measure that takes MoE layers and Hybrid layers into account, you can check out the [num_floating_point_operations](https://github.com/NVIDIA/Megatron-LM/blob/f34fa11af6f5dc65f5342f2a785c3227446cebfd/megatron/training/training.py#L158) function in Megatron-LM.
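As a quick sanity check on the C ≈ 6·N·D rule of thumb, here is a tiny sketch. The parameter and token counts are the rough SmolLM3 figures quoted in this post (a 3B model trained on 11T tokens).

```python
def training_flops(n_params: float, n_tokens: float) -> float:
    """Dense-model approximation: ~6 FLOPs per parameter per training token."""
    return 6 * n_params * n_tokens

C = training_flops(3e9, 11e12)
print(f"{C:.2e} FLOPs")  # ~1.98e+23 FLOPs for a 3B model on 11T tokens
```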
Now, how does this relate to learning rate? We can derive scaling laws that predict optimal learning rates and batch sizes as functions of total compute budget (C). They help answer questions like:
@@ -2585,11 +2501,11 @@ Additionally, across all of these sources and domains, there's often a subset of
To balance data across sources and make use of high-quality data, we need to carefully design the *mixture* : the relative proportion of training documents from each source. Since a language model's performance on some particular task or domain depends heavily on the amount of data it saw that is relevant to that task, tuning the mixing weights provides a direct way of balancing the model's capabilities across domains. Because these trade-offs are model-dependent and difficult to predict, ablations are essential.
-But the mixture doesn't have to stay fixed throughout training. By adjusting the mixture as training progresses, what we call **multi-stage training** [@smollm2] or curriculum, we can make better use of both high-quality and lower-quality data.
+But the mixture doesn't have to stay fixed throughout training. By adjusting the mixture as training progresses, what we call **multi-stage training** or curriculum, we can make better use of both high-quality and lower-quality data.
#### **The evolution of training curricula**
-In the early days of large language model training, the standard approach was to fix a single data mixture for the entire training run. Models like GPT3 and early versions of Llama trained on a static mixture from start to finish. More recently, the field has shifted toward **multi-stage training** where the data mixture changes over the course of training. The main motivation is that a language model's final behavior is strongly influenced by data seen toward the end of training [@chen2025scalinglawspredictingdownstream]. This insight enables a practical strategy: upweighting more plentiful sources early in training and mixing in smaller, higher quality sources towards the end.
+In the early days of large language model training, the standard approach was to fix a single data mixture for the entire training run. Models like GPT-3 and early versions of Llama trained on a static mixture from start to finish. More recently, the field has shifted toward **multi-stage training** [@smollm2] where the data mixture changes over the course of training. The main motivation is that a language model's final behavior is strongly influenced by data seen toward the end of training [@chen2025scalinglawspredictingdownstream]. This insight enables a practical strategy: upweighting more plentiful sources early in training and mixing in smaller, higher quality sources towards the end.
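To make the idea of a multi-stage curriculum concrete, here is a minimal sketch of stage-dependent mixing weights and a sampler that picks a data source per document. The stage boundaries, source names, and weights are invented for illustration and are not SmolLM3's actual mixture.

```python
import random

# Hypothetical mixing weights per stage (fractions of sampled documents per source)
STAGES = [
    # (tokens seen at which the stage ends, {source: weight})
    (8e12,  {"web": 0.75, "code": 0.12, "math": 0.05, "multilingual": 0.08}),
    (10e12, {"web": 0.60, "code": 0.20, "math": 0.10, "multilingual": 0.10}),
    (11e12, {"web": 0.45, "code": 0.25, "math": 0.18, "multilingual": 0.12}),  # decay phase
]

def sample_source(tokens_seen: float, rng: random.Random) -> str:
    """Pick a data source according to the mixture of the current stage."""
    for stage_end, weights in STAGES:
        if tokens_seen < stage_end:
            sources, probs = zip(*weights.items())
            return rng.choices(sources, weights=probs, k=1)[0]
    return "web"  # past the last stage boundary

rng = random.Random(0)
print(sample_source(2e12, rng))     # early training: mostly web
print(sample_source(10.5e12, rng))  # decay phase: math and code upsampled
```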
A common question is: how do you decide when to change the mixture? While there's no universal rule, we typically follow these principles:
@@ -2602,7 +2518,7 @@ Now that we've established why mixtures matter and how curricula work, let's dis
When testing data mixtures, our approach is similar to how we run architecture ablations, with one difference: we try to run them at the target model scale. Small and large models have different capacities, for example a very small model might struggle to handle many languages, while a larger one can absorb them without sacrificing performance elsewhere. Therefore running data ablations at too small a scale risks drawing the wrong conclusions about the optimal mix.
-For SmolLM3, we ran our main data ablations directly on the 3B model, using shorter training runs of 50B and 100B tokens. We also used another type of ablation setup: ***annealing experiments*** . Instead of training from scratch with different mixtures, we took an intermediate checkpoint from the main run (for example at 7T tokens) and continued training with modified data compositions. This approach, allows us to test data mixture changes for doing multi-stage training (i.e changing the training mixture mid-training), and was used in recent work such as SmolLM2, Llama3 and Olmo2. For evaluation, we expanded our benchmark suite to include multilingual tasks alongside our standard English evaluations, ensuring we could properly assess the trade-offs between different language ratios.
+For SmolLM3, we ran our main data ablations directly on the 3B model, using shorter training runs of 50B and 100B tokens. We also used another type of ablation setup: ***annealing experiments***. Instead of training from scratch with different mixtures, we took an intermediate checkpoint from the main run (for example at 7T tokens) and continued training with modified data compositions. This approach allows us to test data mixture changes for multi-stage training (i.e. changing the training mixture mid-training), and was used in recent work such as SmolLM2, Llama3 and Olmo2. For evaluation, we expanded our benchmark suite to include multilingual tasks alongside our standard English evaluations, ensuring we could properly assess the trade-offs between different language ratios.
@@ -2692,23 +2608,21 @@ You've made it this far, congrats! The real fun is about to begin.
At this point, we have everything in place: a validated architecture, a finalized data mixture, and tuned hyperparameters. The only thing left is setting up the infrastructure and hitting "train".
-For SmolLM3, we trained on 384 H100 GPUs (48 nodes) for nearly a month, processing 11 trillion tokens. This section walks you through what actually happens during a long training run: the pre-flight checks, the inevitable surprises, and how we kept things stable. You'll see firsthand why both solid ablation practices and reliable infrastructure matter. We cover the technical infrastructure details of GPU hardware, storage systems, and optimizing throughputs in the next chapter.
-
-It's a bit like setting off on a long road trip. We might have a perfect itinerary, but before we pull out of the driveway, we still check the car's engine, tires, and fuel. LLM training is the same, even with all the prep work, there's still a final round of sanity checks that can save us from unpleasant surprises mid-run.
+For SmolLM3, we trained on 384 H100 GPUs (48 nodes) for nearly a month, processing 11 trillion tokens. This section walks you through what actually happens during a long training run: the pre-flight checks, the inevitable surprises, and how we kept things stable. You'll see firsthand why both solid ablation practices and reliable infrastructure matter. We cover the technical infrastructure details of GPU hardware, storage systems, and optimizing throughput in the [final chapter](#infrastructure---the-unsung-hero).
-Our team has been through this many times: from StarCoder and StarCoder2, to SmolLM, SmolLM2, and now SmolLM3. Every single run is different. Even if you've trained a dozen models, each new run finds a fresh way to surprise you. This section is about stacking the odds in your favor so you're ready for those surprises.
+Our team has been through this many times: from StarCoder and StarCoder2, to SmolLM, SmolLM2, and now SmolLM3. Every single run is different. Even if you've trained a dozen models, each new run finds a fresh way to surprise you. This section is about stacking the odds in your favour so you're ready for those surprises.
-### Pre-flight checklist: What to verify before hitting "train"
+### Pre-flight checklist: what to verify before hitting "train"
Before hitting "train", we go through a checklist to ensure everything works end-to-end:
**Infrastructure readiness:**
-- If your cluster supports reservations, use them. For SmolLM3, we had a fixed 48-node reservation for the entire run. That meant no queueing delays, consistent throughput, and the ability to track node health over time.
+- If your cluster supports Slurm reservations, use them. For SmolLM3, we had a fixed 48-node reservation for the entire run. That meant no queueing delays, consistent throughput, and the ability to track node health over time.
- Stress-test GPUs before launch (we use [GPU Fryer](https://github.com/huggingface/gpu-fryer) and [DCGM Diagnostics](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html)) to catch throttling or performance degradation. For SmolLM3, we found two GPUs throttling and replaced them before starting the run.
- Avoid storage bloat: our system uploads each checkpoint to S3, then deletes the local copy right after saving the next one, so we never store more than one on the fast local GPU SSDs (a minimal sketch of this rotation follows the checklist).
- **Evaluation setup:** Evaluations are deceptively time-consuming. Even with everything implemented, running them manually, logging results, and making plots can eat up hours each time. So try to automate them completely, and ensure they are running and logging correctly before the run starts. For SmolLM3, every saved checkpoint automatically triggered an evaluation job on the cluster that got logged to wandb/trackio.
+ **Evaluation setup:** Evaluations are deceptively time-consuming. Even with everything implemented, running them manually, logging results, and making plots can eat up hours each time. So try to automate them completely, and ensure they are running and logging correctly before the run starts. For SmolLM3, every saved checkpoint automatically triggered an evaluation job on the cluster that got logged to Wandb and [Trackio](https://github.com/gradio-app/trackio).
 **Checkpoint & auto-resume system:** Verify that checkpoints are saved correctly and that the training job can resume from the latest one without manual intervention. On Slurm, we use the `--requeue` option so a failed job gets automatically relaunched, resuming from the most recent checkpoint.
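Here is a minimal sketch of the checkpoint-rotation idea from the checklist above: upload the freshly saved checkpoint to S3, then delete the previous local copy. The paths, bucket name, and use of the AWS CLI are placeholders; our actual tooling differs.

```python
import subprocess
from pathlib import Path
from typing import Optional

def rotate_checkpoint(new_ckpt: Path, prev_ckpt: Optional[Path], bucket: str) -> None:
    """Upload the freshly saved checkpoint to S3, then drop the previous local copy."""
    subprocess.run(
        ["aws", "s3", "sync", str(new_ckpt), f"s3://{bucket}/{new_ckpt.name}"],
        check=True,
    )
    if prev_ckpt is not None and prev_ckpt.exists():
        subprocess.run(["rm", "-rf", str(prev_ckpt)], check=True)

# e.g. after saving step 12000, upload it and remove step 11000 from the local NVMe
rotate_checkpoint(
    Path("/scratch/ckpts/step_12000"),
    Path("/scratch/ckpts/step_11000"),
    bucket="my-training-bucket",
)
```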
@@ -2723,7 +2637,7 @@ For detailed guidance on GPU testing, storage benchmarking, monitoring setup, an
### Scaling surprises
-After running extensive ablations for SmolLM3, we were ready for the full-scale run. Our 3B ablations on 100B tokens looked promising. The architectural changes compared to SmolLM2 (GQA, NoPE, document masking, tokenizer) either improved or maintained performance, and we found a good data mixture that balances English, multilingual, code, and math performance. We optimized our configuration for around 30% MFU on 384 GPUS (48 nodes).
+After running extensive ablations for SmolLM3, we were ready for the full-scale run. Our 3B ablations on 100B tokens looked promising. The architectural changes compared to SmolLM2 (detailed in [Architecture Choices](#architecture-choices): GQA, NoPE, document masking, tokenizer) either improved or maintained performance, and we found a good data mixture that balances English, multilingual, code, and math performance (see [The art of data curation](#smollm3-curating-the-data-mixture-web-multilingual-math-code)). We optimized our configuration for around 30% MFU on 384 GPUs (48 nodes).
@@ -2759,9 +2673,11 @@ title: "Throughput Weka Drops"
This didn't happen in any ablation run, so what changed? Three things:
-1. Hardware can develop issues, GPUs that worked fine in ablations might fail and network connections might degrade under sustained load.
-1. We were now training on the full dataset instead of subsets: ~24 TB spanning dozens of web, code, math, and multilingual sources.
-1. We set the real step count for 11T tokens instead of the short 100B-token ablation horizon.
+1. Hardware state can change over time. GPUs that worked fine in ablations might fail and network connections might degrade under sustained load.
+1. The size of the training datasets. We now used the full ~24 TB training dataset instead of the smaller subsets from ablations, though the data sources themselves were the same.
+1. The number of training steps. We set the real step count for 11T tokens instead of the short 100B-token ablation horizon.
+
+Everything else remained exactly the same as in the throughput ablations: number of nodes, dataloader configuration, model layout, and parallelism setup...
Intuitively, neither dataset size nor step count should cause throughput drops, so we naturally suspected hardware issues first. We checked our node monitoring metrics, which showed that the big throughput drops correlated with spikes in disk read latency. That pointed us straight at our data storage.
@@ -2769,11 +2685,11 @@ Intuitively, neither dataset size nor step count should cause throughput drops,
Our cluster has three storage tiers for training data:
-- **FSx** : Network-attached storage which uses [Weka](https://www.weka.io/) a "keep-hot" caching model that stores frequently accessed files locally and evicts inactive "cold" files to S3 as capacity fills up.
+- **FSx** : Network-attached storage using [Weka](https://www.weka.io/), which follows a "keep-hot" caching model: frequently accessed files stay local, while inactive "cold" files are evicted to S3 as capacity fills up.
- **Scratch (Local NVMe RAID)** : Fast local storage on each node (8×3.5TB NVMe drives in RAID), which is faster than FSx but limited to local node access.
- **S3** : Remote object storage for cold data and backups.
-You can find more details in the Infrastructure chapter.
+You can find more details in the [Infrastructure chapter](#infrastructure---the-unsung-hero).
For SmolLM3's 24TB dataset, we initially stored the data in FSx (Weka). With that much data on top of storage already used by several other teams, we were pushing Weka's capacity to the limit, so it started evicting dataset shards mid-training. Fetching them back created stalls, which explained the big throughput drops. Worse: there was no way to pin our dataset folders as hot for the full training run.
@@ -2812,8 +2728,6 @@ Still suspecting hardware, we decided to test on fewer nodes. With 384 GPUs, the
Remember the three things that changed from our ablations? We had already addressed the data storage issue by moving to local node storage. Hardware was now eliminated. That left only one variable: the step count. We tested this by rolling back to smaller step counts (from 3M down to 32k) and the throughput drops became smaller! Larger step counts produced sharper, more frequent drops.
-Remember the three things that changed from our ablations? We had already addressed the data storage issue by moving to local node storage. Hardware was now eliminated. That left only one variable: the step count.
-
To test this, we ran identical configurations with only the training steps changed from 32k to 3.2M. You can see the [exact configs we used](https://huggingface.co/datasets/HuggingFaceTB/ablations-training-configs/tree/main/throughput_debugging):
```diff
@@ -2875,6 +2789,11 @@ We had two options:
With the time pressure to start the run and our cluster reservation running, we went with option #2 as the safer, faster fix. Tokenized data was already on each node, so reshuffling locally was cheap (~1 h). We also generated shuffled sequences for each epoch with different seeds to avoid repeating shuffling patterns across epochs.
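A minimal sketch of the offline pre-shuffling idea: generate one permutation of sequence indices per epoch, each with its own seed, and store it next to the tokenized data. The file layout and names are illustrative, not our actual pipeline.

```python
import numpy as np

def write_epoch_shuffles(num_sequences: int, num_epochs: int, base_seed: int, out_dir: str) -> None:
    """Precompute a differently-seeded permutation of sequence indices for each epoch."""
    for epoch in range(num_epochs):
        rng = np.random.default_rng(base_seed + epoch)  # different seed per epoch
        order = rng.permutation(num_sequences).astype(np.int64)
        np.save(f"{out_dir}/shuffle_epoch{epoch}.npy", order)

# The dataloader then reads sequences in the order given by the current epoch's file,
# so no shuffling logic (or RAM-hungry index) is needed at training time.
write_epoch_shuffles(num_sequences=2_000_000, num_epochs=4, base_seed=42, out_dir="/scratch/shuffles")
```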
+
+
+When facing urgent deadlines, it might be faster to adopt a proven solution or quick workaround than to debug your own broken implementation. Earlier, we plugged in the TokenizedBytes dataloader rather than fixing nanosets' index implementation. Here, we chose offline pre-shuffling over dataloader changes. But be deliberate about when you take shortcuts, or you'll end up with a patchwork system that's hard to maintain or optimize.
+
+
#### **Launch, Take Two**
By now we had:
@@ -2889,7 +2808,7 @@ We relaunched. This time, everything held. The loss curve was smooth, throughput
After fixing the throughput and dataloader issues, we launched the run again and trained smoothly for the first two days. Throughput was stable, loss curves looked as expected, and nothing in the logs suggested any problems. At around the 1T token mark, however, the evaluations revealed something unexpected.
-As part of our monitoring, we evaluate intermediate checkpoints and compare them to historical runs. For instance, we had the intermediate checkpoints from SmolLM2 (1.7B) trained on a similar recipe, so we could track how both models progressed at the same stages of training. The results were puzzling: despite having more parameters and a better data mixture, the 3B model was performing worse than the 1.7B at the same training point. Loss was still decreasing, and benchmark scores were improving, but the improvement rate was clearly below expectations.
+As part of our monitoring, we evaluate intermediate checkpoints and compare them to historical runs. For instance, we had the [intermediate checkpoints](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-intermediate-checkpoints) from SmolLM2 (1.7B) trained on a similar recipe, so we could track how both models progressed at the same stages of training. The results were puzzling: despite having more parameters and a better data mixture, the 3B model was performing worse than the 1.7B at the same training point. Loss was still decreasing, and benchmark scores were improving, but the improvement rate was clearly below expectations.
Given that we had thoroughly tested every architecture and data change introduced in SmolLM3 compared to SmolLM2, and had validated the training framework, there were only a few remaining untested differences between the two training setups. The most obvious was tensor parallelism. SmolLM2 could fit on a single GPU and was trained without TP, while SmolLM3 required TP=2 to fit in memory. We didn't suspect it or think of testing it before, since TP was used in the 3B ablations and their results made sense.
@@ -2897,7 +2816,7 @@ Given that we had thoroughly tested every architecture and data change introduce
To test the TP bug hypothesis, we trained a 1.7B model with the exact same setup as SmolLM3 — same architecture changes (document masking, NoPE), same data mixture, same hyperparameters — both with and without TP. The difference was immediate: the TP version consistently had a higher loss and lower downstream performance than the non-TP version. That confirmed we were looking at a TP-related bug.
-We then examined the TP implementation in detail, comparing weights from TP and non-TP runs. The problem turned out to be subtle but significant: we were using identical random seeds across all TP ranks, when each rank should have been initialized with a different seed. This caused correlated weight initialization across shards, which affected convergence. The effect was not catastrophic — the model still trained and improved — but it introduced enough inefficiency to explain the gap we observed at scale.
+We then examined the TP implementation in detail, comparing weights from TP and non-TP runs. The problem turned out to be subtle but significant: we were using identical random seeds across all TP ranks, when each rank should have been initialised with a different seed. This caused correlated weight initialisation across shards, which affected convergence. The effect was not catastrophic — the model still trained and improved — but it introduced enough inefficiency to explain the gap we observed at scale.
Below is the bug fix:
```diff
@@ -2963,34 +2882,28 @@ With that, we had resolved the last in a series of unexpected issues that had su
### Staying the course
-As the previous section showed, scaling from ablations to full pretraining wasn't just "plug and play." unexpected challenges, it brought unexpected challenges, but we successfully identified and resolved each issue. This section covers the essential monitoring setup and considerations for large-scale training runs. We'll address critical questions: When should you restart training after encountering problems? How do you handle issues that surface deep into a run? Which metrics truly matter? Should you maintain a fixed data mixture throughout training?
+As the previous section showed, scaling from ablations to full pretraining wasn't just "plug and play." It brought unexpected challenges, but we successfully identified and resolved each issue. This section covers the essential monitoring setup and considerations for large-scale training runs. We'll address critical questions: when should you restart training after encountering problems? How do you handle issues that surface deep into a run? Which metrics truly matter? Should you maintain a fixed data mixture throughout training?
-#### Training monitoring: Beyond loss curves
+#### Training monitoring: beyond loss curves
The reason we caught the tensor-parallelism bug was not the loss curve, which looked fine, but the fact that downstream evaluations were lagging behind expectations. Additionally, having evaluations from SmolLM2's intermediate checkpoints was critical: they gave us an early sanity check that the 3B model wasn't on the right track. So if you're training large models, start running downstream evaluations early, and if you're comparing to an open-source model, ask whether the authors can provide intermediate checkpoints; those can be invaluable as reference points.
-On the infrastructure side, the most important metric is throughput, measured in tokens per second. For SmolLM3, we expected stable throughput between 13,500–14,000 tokens/sec across the run, and any sustained deviation was a red flag. But throughput alone is not enough: you also need continuous hardware health monitoring to anticipate and detect hardware failures. Some of the key metrics we tracked included: GPU temperatures, memory usage and compute utilization. We log them into Grafana dashboards and set up real-time Slack alerts for hardware anomalies.
+On the infrastructure side, the most important metric is throughput, measured in tokens per second. For SmolLM3, we expected stable throughput between 13,500–14,000 tokens/sec across the run, and any sustained deviation was a red flag. But throughput alone is not enough: you also need continuous hardware health monitoring to anticipate and detect hardware failures. Some of the key metrics we tracked included: GPU temperatures, memory usage and compute utilisation. We log them into Grafana dashboards and set up real-time Slack alerts for hardware anomalies.
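As an illustration of the kind of throughput alert we mean, here is a minimal sketch that flags sustained deviations from the expected tokens/sec band; the 13,500 tokens/sec threshold comes from the SmolLM3 numbers above, while the patience window and notification hook are placeholders.

```python
def check_throughput(history: list[float], low: float = 13_500, patience: int = 20) -> bool:
    """Return True (and alert) if the last `patience` steps all fell below the expected band."""
    if len(history) < patience:
        return False
    recent = history[-patience:]
    if all(tps < low for tps in recent):
        # placeholder for a real notification hook (Slack webhook, Grafana alert, ...)
        print(f"ALERT: throughput below {low} tok/s for {patience} consecutive steps")
        return True
    return False

# e.g. called once per logged step with the measured tokens/sec
history = [13_900.0] * 100 + [11_200.0] * 20
assert check_throughput(history)
```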
#### Fix and restart vs fix on the fly
Given that we restarted our run after 1T tokens, an important question arises: do you always need to restart when something goes wrong? The answer depends on the severity and root cause of the issue.
-In our case, the TP seeding bug meant we were starting on the wrong foot, half our weights weren't properly initialized. The model was showing performance similar to SmolLM2 and plateauing at similar points, meaning we'd likely end up with a model that performed the same but cost almost twice as much to train. Restarting made sense. However, many issues can be course-corrected mid-run to avoid wasting compute. The most common issue involves loss spikes, those sudden jumps in training loss that can either signal minor hiccups or divergence.
+In our case, the TP seeding bug meant we were starting on the wrong foot: half our weights weren't properly initialised. The model was showing performance similar to SmolLM2 and plateauing at similar points, meaning we'd likely end up with a model that performed the same but cost almost twice as much to train. Restarting made sense. However, many issues can be course-corrected mid-run to avoid wasting compute. The most common issue involves *loss spikes*, those sudden jumps in training loss that can either signal minor hiccups or divergence.
As [Stas Bekman](https://media.istockphoto.com/id/486869012/fr/photo/ch%C3%A8vre-est-%C3%A0-nous.jpg?s=612x612&w=0&k=20&c=F26PCPZiy1P3FLZS23GWhKcQ8Buqfx8StHYoX85hq-s%3D) nicely puts it in the [Machine Learning Engineering Open Book](https://github.com/stas00/ml-engineering/blob/master/training/instabilities/training-loss-patterns.md): "Training loss plots are similar to heartbeat patterns—there's the good, the bad, and the you-should-worry ones."
-
-
+
Loss spikes fall into two categories:
@@ -3002,8 +2915,8 @@ While we don't fully understand training instabilities, we know they become more
- High learning rates: These cause instability early in training and can be fixed by reducing the learning rate.
- Bad data: Usually the main cause of recoverable spikes, though recovery may be slow. This can happen deep into training when the model encounters low-quality data.
-- Data-parameter state interactions: PaLM observed that spikes often result from specific combinations of data batches and model parameter states, rather than "bad data" alone. Training on the same problematic batches from a different checkpoint didn't reproduce the spikes.
-- Poor initialization: Recent work by OLMo2 showed that switching from scaled initialization to a simple normal distribution (mean=0, std=0.02) improved stability.
+- Data-parameter state interactions: PaLM [@palm] observed that spikes often result from specific combinations of data batches and model parameter states, rather than "bad data" alone. Training on the same problematic batches from a different checkpoint didn't reproduce the spikes.
+- Poor initialisation: Recent work by OLMo2 [@olmo2] showed that switching from scaled initialisation to a simple normal distribution (mean=0, std=0.02) improved stability.
- Precision issues: While no one trains with FP16 anymore, [BLOOM](https://arxiv.org/abs/2211.05100) found it highly unstable compared to BF16.
**Before spikes happen, build in stability:**
@@ -3012,9 +2925,9 @@ Small models with conservative learning rates and good data rarely spike, but la
Data filtering and shuffling: By this point in the blog, you've noticed how often we circle back to data. Making sure your data is clean and well-shuffled can prevent spikes. For instance, OLMo2 found that removing documents with repeated n-grams (32+ repetitions of 1-13 token spans) significantly reduced spike frequency.
-Training modifications: Z-loss regularization keeps output logits from growing too large without affecting performance. And excluding embeddings from weight decay also helps.
+Training modifications: Z-loss regularisation keeps output logits from growing too large without affecting performance. And excluding embeddings from weight decay also helps.
-Architectural changes: QKNorm (normalizing query and key projections before attention) has proven effective. OLMo2 and other teams found it helps with stability, and interestingly, [Marin team](https://wandb.ai/marin-community/marin/reports/Marin-32B-Work-In-Progress--VmlldzoxMzM1Mzk1NQ) found that it can even be applied mid-run to fix divergence issues.
+Architectural changes: QKNorm (normalising query and key projections before attention) has proven effective. OLMo2 and other teams found it helps with stability, and interestingly, [Marin team](https://wandb.ai/marin-community/marin/reports/Marin-32B-Work-In-Progress--VmlldzoxMzM1Mzk1NQ) found that it can even be applied mid-run to fix divergence issues.
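To make these two stability tricks concrete, here is a minimal PyTorch sketch of a z-loss term and of QKNorm applied to query/key projections. The 1e-4 coefficient and RMSNorm placement are common choices in the literature, not a prescription, and `torch.nn.RMSNorm` requires a recent PyTorch (otherwise substitute your own RMSNorm).

```python
import torch
import torch.nn.functional as F

def z_loss(logits: torch.Tensor, coeff: float = 1e-4) -> torch.Tensor:
    """Penalise large output logits: coeff * mean(logsumexp(logits)^2)."""
    return coeff * torch.logsumexp(logits, dim=-1).pow(2).mean()

class QKNorm(torch.nn.Module):
    """Apply RMSNorm to queries and keys (per head) before computing attention scores."""
    def __init__(self, head_dim: int):
        super().__init__()
        self.q_norm = torch.nn.RMSNorm(head_dim)
        self.k_norm = torch.nn.RMSNorm(head_dim)

    def forward(self, q: torch.Tensor, k: torch.Tensor):
        return self.q_norm(q), self.k_norm(k)

# z-loss usage in a training step (logits: [batch, vocab])
logits = torch.randn(4, 32_000)
loss = F.cross_entropy(logits, torch.randint(0, 32_000, (4,))) + z_loss(logits)

# QKNorm usage (q: [batch, q_heads, seq, head_dim], k: [batch, kv_heads, seq, head_dim])
qknorm = QKNorm(head_dim=128)
q, k = qknorm(torch.randn(2, 16, 1024, 128), torch.randn(2, 4, 1024, 128))
```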
**When spikes happen anyway - damage control:**
@@ -3024,10 +2937,11 @@ Even with these precautions, spikes can still occur. Here are some options for f
- **Tighten gradient clipping** : Reduce the gradient norm threshold temporarily
- **Apply architectural fixes** such as QKNorm, as done in Marin.
+We've walked through the scaling challenges, from throughput drops to the TP bug, the monitoring practices to catch problems early, and strategies for preventing and fixing loss spikes. Let's finish this chapter by discussing how multi-stage training can enhance your model's final performance.
+
### Mid-training
-Modern LLM pretraining typically involves multiple stages with different data mixtures, often followed by a final phase to extend context length. For example, Qwen3 uses a three-stage approach: a general stage on 30T tokens at 4k context, a reasoning stage with 5T higher-quality tokens emphasizing STEM and coding, and finally a long context stage on hundreds of billions of tokens at 32k context length. SmolLM3 follows a similar philosophy, with planned interventions to introduce higher-quality datasets and extend context, alongside reactive adjustments based on performance monitoring.
-As we explained in the data curation section, some interventions are planned from the start: for SmolLM3, we knew we'd introduce higher-quality math (FineMath4+) and code (Stack-Edu) in Stage 2, then add curated Q&A and reasoning math and code data during the final decay phase. Other interventions are reactive, driven by monitoring performance during training. For example, in SmolLM2, when we found math and code performance lagging behind our targets, we responded by curating entirely new datasets (FineMath and Stack-Edu) and introducing them mid-training.
+Modern LLM pretraining typically involves multiple stages with different data mixtures, often followed by a final phase to extend context length. For example, Qwen3 [@qwen3] uses a three-stage approach: a general stage on 30T tokens at 4k context, a reasoning stage with 5T higher-quality tokens emphasising STEM and coding, and finally a long context stage on hundreds of billions of tokens at 32k context length. SmolLM3 follows a similar philosophy, with planned interventions to introduce higher-quality datasets and extend context, alongside reactive adjustments based on performance monitoring.
As we explained in the data curation section, the data mixture doesn't have to stay fixed throughout training. Multi-stage training allows us to strategically shift dataset proportions as training progresses. Some interventions are planned from the start: for SmolLM3, we knew we'd introduce higher-quality FineMath4+ and Stack-Edu in Stage 2, then add curated Q&A and reasoning data during the final decay phase. Other interventions are reactive, driven by monitoring performance during training. For example, in SmolLM2, when we found math and code performance lagging behind our targets, we curated entirely new datasets (FineMath and Stack-Edu) and introduced them mid-training. This flexibility—whether following a planned curriculum or adapting to emerging gaps—is what allows us to maximize the value of our compute budget.
@@ -3053,7 +2967,7 @@ During the learning rate decay phase, we further upsample high-quality code and
#### Long context extension: from 4k to 128k tokens
-Context length determines how much text your model can process, it's crucial for tasks like analyzing long documents, maintaining coherent multi-turn conversations, or processing entire codebases. SmolLM3 started training at 4k tokens, but we needed to scale to 128k for real-world applications.
+Context length determines how much text your model can process; it's crucial for tasks like analysing long documents, maintaining coherent multi-turn conversations, or processing entire codebases. SmolLM3 started training at 4k tokens, but we needed to scale to 128k for real-world applications.
**Why extend context mid-training?**
@@ -3065,31 +2979,35 @@ We didn't jump straight to 128k. Instead, we gradually extended context in stage
-During the long context ablations, we found [HELMET](https://arxiv.org/abs/2410.02694) benchmark to be very noisy on base models
-(the same training with different seeds gives variable results). [Gao et al.](https://arxiv.org/abs/2410.02660)
-recommend doing SFT on top to reduce variance on the benchmarks' tasks.
-Instead we go for Ruler, which we found to give more reliable signal at the base
-model level.
+During the long context ablations, we found the [HELMET](https://arxiv.org/abs/2410.02694) benchmark to be very noisy on base models (the same training with different seeds gives variable results). Gao et al. recommend doing SFT on top to reduce variance on the benchmarks' tasks.
+Instead we opted for RULER, which we found to give a more reliable signal at the base model level.
-During this phase, it's common to upsample long context documents such as lengthy web pages and books to improve long context performance [@prolong; qwen3]. We ran several ablations upsampling books, articles, and even synthetically generated documents for tasks like retrieval and fill-in-the-middle, following Qwen2.5-1M's approach [@qwen1Million] with FineWeb-Edu and Python-Edu. Surprisingly, we didn't observe any improvement over just using the baseline mixture from Stage 3, which was already competitive with other state-of-the-art models like Llama 3.2 3B and Qwen2.5 3B on Ruler. We hypothesize this is because the baseline mixture naturally includes long documents from web data and code (estimated at 10% of tokens), and that using NoPE helped.
+During this phase, it's common to upsample long context documents such as lengthy web pages and books to improve long context performance [@prolong; @qwen3]. We ran several ablations upsampling books, articles, and even synthetically generated documents for tasks like retrieval and fill-in-the-middle, following Qwen2.5-1M's approach [@qwen1Million] with FineWeb-Edu and Python-Edu. Surprisingly, we didn't observe any improvement over just using the baseline mixture from Stage 3, which was already competitive with other state-of-the-art models like Llama 3.2 3B and Qwen2.5 3B on RULER. We hypothesise this is because the baseline mixture naturally includes long documents from web data and code (estimated at 10% of tokens), and that using NoPE helped.
-For more insights into long context extension, we recommend reading the paper [How to Train Long-Context Language Models (Effectively)](https://arxiv.org/abs/2410.02660)
+For more insights into long context extension, we recommend reading the paper *How to Train Long-Context Language Models (Effectively)*.
- **RoPE ABF (RoPE with Adjusted Base Frequency):** When going from 4k to 32k, we increased RoPE theta (base frequency) to 2M, and to go from 32k to 64k, we increased it to 5M. We found that using larger values like 10M slightly improves Ruler score but hurts some short context task like GSM8k, so we kept 5M which didn't impact short context.
+ **RoPE ABF (RoPE with Adjusted Base Frequency):** When going from 4k to 32k, we increased RoPE theta (base frequency) to 2M, and to go from 32k to 64k, we increased it to 5M. We found that using larger values like 10M slightly improves the RULER score but hurts some short context tasks like GSM8k, so we kept 5M, which didn't impact short context.
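To see why increasing the RoPE base helps, here is a small sketch of the standard RoPE inverse frequencies and the longest rotation wavelength they produce for different theta values; the head dimension of 128 is an illustrative value, not necessarily SmolLM3's.

```python
import numpy as np

def rope_wavelengths(theta: float, head_dim: int = 128) -> np.ndarray:
    """Per-dimension-pair rotation wavelengths (in tokens) of standard RoPE."""
    inv_freq = 1.0 / (theta ** (np.arange(0, head_dim, 2) / head_dim))
    return 2 * np.pi / inv_freq

for theta in (10_000, 2_000_000, 5_000_000):
    print(f"theta={theta:>9,}: longest wavelength ~ {rope_wavelengths(theta).max():,.0f} tokens")
# Larger theta stretches the slowest-rotating dimensions so that positions tens of
# thousands of tokens apart still get distinguishable (non-aliased) phases.
```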
During this context extension phase, we also used the opportunity to further upsample math, code, and reasoning Q&A data, and we added a few hundred thousand samples in ChatML format.
+
+
+We also experimented with sliding window attention (window sizes of 4k, 8k, and 16k) during the 4k→32k extension, but found it performed worse on RULER compared to full attention.
+
+
**YARN extrapolation: Reaching 128k**
-Even after training on 64k contexts, we wanted SmolLM3 to handle 128k at inference. Rather than training on 128k sequences (prohibitively expensive), we used YARN (Yet Another RoPE extensioN method) [@yarn], which allows the model to extrapolate beyond its training length. In theory, YARN allows a four-fold increase in sequence length. We found that using the 64k checkpoint gave better performance at 128k than using the 32k checkpoint, confirming the benefit of training closer to the target inference length. However, pushing to 256k (four-fold from 64k) showed degraded Ruler performance, so we recommend using the model up to 128k.
+Even after training on 64k contexts, we wanted SmolLM3 to handle 128k at inference. Rather than training on 128k sequences (prohibitively expensive), we used YARN (Yet Another RoPE extensioN method) [@yarn], which allows the model to extrapolate beyond its training length. In theory, YARN allows a four-fold increase in sequence length. We found that using the 64k checkpoint gave better performance at 128k than using the 32k checkpoint, confirming the benefit of training closer to the target inference length. However, pushing to 256k (four-fold from 64k) showed degraded RULER performance, so we recommend using the model up to 128k.
+
+And with that, we've walked through the full pretraining journey for SmolLM3, from planning and ablations to the final training run, with all the behind-the-scenes challenges along the way.
### Wrapping up pretraining
-We've covered a lot of ground. From the Training Compass that helped us decide why and what to train, through strategic planning, systematic ablations that validated every architectural choice, to the actual training marathon where surprises emerged at scale (throughput mysteriously collapsing, dataloader bottlenecks, and a subtle tensor parallelism bug that forced a restart at 1T tokens).
+We've covered a lot of ground. From the training compass that helped us decide why and what to train, through strategic planning, systematic ablations that validated every architectural choice, to the actual training marathon where surprises emerged at scale (throughput mysteriously collapsing, dataloader bottlenecks, and a subtle tensor parallelism bug that forced a restart at 1T tokens).
-The messy reality behind those polished technical reports is now visible: **training LLMs is as much about disciplined experimentation and rapid debugging as it is about architectural innovations.** Planning identifies what's worth testing. Ablations validate each decision. Monitoring catches problems early. And when things inevitably break, systematic derisking tells you exactly where to look.
+The messy reality behind those polished technical reports is now visible: **training LLMs is as much about disciplined experimentation and rapid debugging as it is about architectural innovations and data curation.** Planning identifies what's worth testing. Ablations validate each decision. Monitoring catches problems early. And when things inevitably break, systematic derisking tells you exactly where to look.
For SmolLM3 specifically, this process delivered what we set out to build: a 3B model trained on 11T tokens that's competitive on math, code, multilingual understanding, and long-context tasks, sitting on the Pareto frontier of Qwen3 models.
@@ -3111,38 +3029,9 @@ This is where post-training comes in. And just like pretraining, the reality is
Once the pre-training finishes, we should have an SFT baseline within a day.
-
+
+
-```mermaid
-flowchart TD
- %% Main algorithms
- Base["Base model"]
- SFT["SFT"]
- ORPO["ORPO"]
- DPOAPO["DPO and friends"]
- RL["RL (here be dragons)"]
- KTO["KTO"]
-
- Base --> SFT
- Base --> ORPO
- Base --> KTO
- Base --> RL
-
- SFT --> ORPO
- SFT --> DPOAPO
- SFT --> RL
- SFT --> KTO
-
- classDef baseModel fill:#f9f9f9
- classDef sft fill:#ffd0c5
- classDef algorithm fill:#fef3c7
- classDef note fill:#f5f5f5
-
- class Base baseModel
- class SFT sft
- class ORPO,DPOAPO,RL,KTO algorithm
-```
-
Pre-training gave us SmolLM3's raw ability, but before the GPUs have even cooled down we enter the next frontier of model capabilities: *post-training* . This includes supervised fine-tuning, reinforcement learning, model merging, and more — all designed to bridge the gap from "a model that predicts text" to "a model people can actually use". If pre-training is about brute-forcing knowledge into weights, post-training is about sculpting that raw capability into something reliable and steerable. And just like pre-training, the polished post-training papers don't capture the late-night surprises: GPU meltdowns, finicky data mixtures, or the way a seemingly minor chat template decision can ripple through downstream benchmarks. In this section, we'll show how we navigated the messy world of post-training to turn SmolLM3 from a strong base model into a state-of-the-art hybrid reasoner.
@@ -3179,7 +3068,7 @@ Let's walk through how we answered these questions for SmolLM3:
- **Why?** For us, the "why" was straightforward as we had a base model that needed post-training before release. At the same time, hybrid reasoning models like Qwen3 were becoming increasingly popular, yet open recipes showing how to train them were scarce. SmolLM3 gave us an opportunity to address both: prepare a model for real-world use and contribute a fully open recipe to sit on the Pareto front alongside Qwen3's 1.7B and 4B models.
- **What?** We set out to train a hybrid reasoning model that was tailored to SmolLM3's strengths, chiefly that reasoning quality should hold up across languages other than English. And since real-world use increasingly involves tool calling and long-context workflows, those became core requirements in our post-training recipe.
-- **How?** that's the rest of this chapter 😀.
+- **How?** That's the rest of this chapter 😀.
Just like with pre-training, we start with the fundamentals: evals and baselines, because every big model starts with a small ablation. But there's a key difference in how we ablate. In pre-training, "small" usually means smaller models and datasets. In post-training, "small" means smaller datasets and *simpler algorithms*. We almost never use a different base model for ablations because behaviour is too model-dependent, and runs are short enough to iterate on the target model directly.
@@ -3208,10 +3097,10 @@ It remains an interesting, yet open question whether tiny models can use tool ca
At Hugging Face, we use a layered eval suite, echoing the pre-training principles (monotonicity, low noise, above-random signal, ranking consistency) that we detailed in the [ablations section](#every-big-model-starts-with-a-small-ablation) for pretraining.
-
+
-The list of evals to consider is continuously evolving and the ones discussed below reflect our focus in mid 2025. See the [Evaluation Guidebook](https://github.com/huggingface/evaluation-guidebook/blob/main/yearly_dives/2025-evaluations-for-useful-models.md) for a comprehensive overview of post-training evals.
-
+The list of evals to consider is continuously evolving as models improve and the ones discussed below reflect our focus in mid 2025. See the [Evaluation Guidebook](https://github.com/huggingface/evaluation-guidebook/blob/main/yearly_dives/2025-evaluations-for-useful-models.md) for a comprehensive overview of post-training evals.
+
Here are the many ways one can evaluate a post-trained model:
@@ -3315,7 +3204,7 @@ With the evals at hand, it's time to train some models! Before doing that, we fi
### Tools of the trade
-Behind every post-training recipe lies a toolbox of frameworks and libraries that enable large-scale experimentation. Each frameworks brings its own set of supported algorithms, fine-tuning methods, and scalability features. The table summarises the main areas of support, from supervised fine-tuning (SFT) to preference optimisation (PO) and reinforcement learning (RL):
+Behind every post-training recipe lies a toolbox of frameworks and libraries that enable large-scale experimentation. Each framework brings its own set of supported algorithms, fine-tuning methods, and scalability features. The table below summarises the main areas of support, from supervised fine-tuning (SFT) to preference optimisation (PO) and reinforcement learning (RL):
| Framework | SFT | PO | RL | Multi-modal | FullFT | LoRA | Distributed |
| --- | --- | --- | --- | --- | --- | --- | --- |
@@ -3328,8 +3217,10 @@ Behind every post-training recipe lies a toolbox of frameworks and libraries tha
| [ **PipelineRL** ](https://github.com/ServiceNow/PipelineRL) | ❌ | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ |
| [ **ART** ](https://github.com/OpenPipe/ART/tree/main) | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ |
| [ **TorchForge** ](https://github.com/meta-pytorch/torchforge) | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ |
+| [ **NemoRL** ](https://github.com/NVIDIA-NeMo/RL) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ |
+| [ **OpenRLHF** ](https://github.com/OpenRLHF/OpenRLHF) | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
-Here *FullFT* refers to **full fine-tuning** , where all model parameters are updated during training. *LoRA* stands for **Low-Rank Adaptation** , a parameter-efficient approach that updates only small low-rank matrices while keeping the base model frozen. Multi-modal refers to whether support for training on modalities beyond text (e.g. images) is supported and Distributed indicates whether training models on more than one GPU is possible.
+Here *FullFT* refers to **full fine-tuning** , where all model parameters are updated during training. *LoRA* stands for **Low-Rank Adaptation** , a parameter-efficient approach that updates only small low-rank matrices while keeping the base model frozen. *Multi-modal* refers to whether support for training on modalities beyond text (e.g. images) is supported and *Distributed* indicates whether training models on more than one GPU is possible.
At Hugging Face, we develop and maintain TRL, so it's our framework of choice and the one we used to post-train SmolLM3.
@@ -3344,11 +3235,6 @@ There is a class of researchers that love to bemoan the use of training framewor
But this position ignores the reality of modern research and production. Take RL for example. Algorithms like PPO and GRPO are notoriously tricky to implement correctly [[@ndetailsrlhf](https://arxiv.org/abs/2403.17031)], and tiny mistakes in normalisation or KL penalties can lead to days of wasted compute and effort.
-
-
-Even for SFT, it is [tricky to implement gradient accumulation correctly](https://github.com/karpathy/nanochat/pull/59) in distributed settings.
-
-
Similarly, although it's tempting to write a single-file implementation of some algorithm, can that same script scale from 1B to 100B+ parameters?
Frameworks exist precisely because the basics are already well-understood and endlessly reinventing them is a poor use of time. That's not to say there's no value in low-level tinkering. Implementing PPO from scratch once is an excellent learning exercise. Writing a toy transformer without a framework teaches you how attention really works. But in most cases, just pick a framework you like and hack it for your purposes.
@@ -3361,7 +3247,7 @@ If you spend any time on X these days, you'd think reinforcement learning (RL) i
-As we'll see later in this blog post, RL really does work, but comes with practical tradeoffs we discuss below.
+As we'll see later in this chapter, RL really does work, but comes with practical tradeoffs we discuss below.
RL isn't new of course. OpenAI and other labs relied heavily on RL from human feedback (RLHF) [[@rlhf](https://huggingface.co/blog/rlhf)] to align their early models, but it wasn't until the release of DeepSeek-R1 [[@deepseekr1](https://huggingface.co/papers/2501.12948)] that RL-based post-training really caught on in the open-source ecosystem.
@@ -3396,7 +3282,7 @@ When choosing a base model for post-training, a few practical dimensions matter
-The [LocalLLaMa subreddit](https://www.reddit.com/r/LocalLLaMA/) is a great place to understand the broad vibes of new models. [Artificial Analysis](https://artificialanalysis.ai/) and [LMArena](https://lmarena.ai/) also provide independent evaluation of new models, although these platforms are sometimes [benchmaxxed by model providers.](https://huggingface.co/papers/2504.20879)
+The LocalLLaMa subreddit is a great place to understand the broad vibes of new models. Artificial Analysis and LMArena also provide independent evaluation of new models, although these platforms are sometimes [benchmaxxed by model providers](https://huggingface.co/papers/2504.20879).
In our experience, the base models from Qwen, Mistral, and DeepSeek are the most amenable to post-training, with Qwen being a clear favourite since each model series typically covers a large parameter range (e.g. Qwen3 models range in size from 0.6B to 235B!). This feature makes scaling far more straightforward.
@@ -3461,7 +3347,7 @@ The table below shows a few popular chat templates and how they compare across t
| GPT-OSS | ✅ | ✅ | ✅ | ✅ | Based on the [Harmony response format](https://cookbook.openai.com/articles/openai-harmony). Complex, yet versatile. |
-In most cases, we've found that ChatML or Qwen's chat templates are an excellent place to start. For SmolLM3, we needed a template for hybrid reasoning and found that Qwen3 was one of the few templates that struck a good balance across the dimensions we cared about. However, it had one quirk that we weren't entirely happy with: the reasoning content is discarded for all but the final turn in a conversation. As shown in the figure below, this is similar to how [OpenAI's reasoning models work](https://platform.openai.com/docs/guides/reasoning/how-reasoning-works):
+In most cases, we've found that ChatML or Qwen's chat templates are an excellent place to start. For SmolLM3, we needed a template for hybrid reasoning and found that Qwen3 was one of the few templates that struck a good balance across the dimensions we cared about. However, it had one quirk that we weren't entirely happy with: the reasoning content is *discarded* for all but the final turn in a conversation. As shown in the figure below, this is similar to how [OpenAI's reasoning models work](https://platform.openai.com/docs/guides/reasoning/how-reasoning-works):
@@ -3547,23 +3433,12 @@ Before we dive into optimisation and squeezing every point of performance, we ne
When it comes to training SFT baselines, here are the main things to consider:
-- Will you use full fine-tuning (FullFT) or parameter efficient methods like LoRA or QLoRA?
-
-
-
-See the wonderful [blog post](https://thinkingmachines.ai/blog/lora/) by Thinking Machines on when these methods produce equivalent performance.
-
-
-- What type of parallelism do you need? For small models or those trained with LoRA, you can usually get by with data parallel. For larger models you will need FSDP2 or DeepSpeed ZeRO-3 to shared the model weights and optimizer states. For models trained with long context, use methods like [context parallelism](https://huggingface.co/docs/trl/v0.23.0/en/reducing_memory_usage#context-parallelism).
+- Will you use full fine-tuning (FullFT) or parameter efficient methods like LoRA or QLoRA? As described in the wonderful [blog post](https://thinkingmachines.ai/blog/lora/) by Thinking Machines, LoRA can match FullFT under certain conditions (usually determined by the size of the dataset).
+- What type of parallelism do you need? For small models or those trained with LoRA, you can usually get by with data parallel. For larger models you will need FSDP2 or DeepSpeed ZeRO-3 to shard the model weights and optimiser states. For models trained with long context, use methods like [context parallelism](https://huggingface.co/docs/trl/v0.23.0/en/reducing_memory_usage#context-parallelism).
- Use kernels like FlashAttention and Liger if your hardware supports them. Many of these kernels are hosted on the [Hugging Face Hub](https://huggingface.co/models?other=kernel) and can be set via a [simple argument](https://huggingface.co/docs/trl/kernels_hub) in TRL to dramatically lower the VRAM usage.
- Mask the loss to [train only on assistant tokens](https://huggingface.co/docs/trl/sft_trainer#train-on-assistant-messages-only). As we discuss below, this can be achieved by wrapping the assistant turns of your chat template with a special `{% generation %}` keyword.
- Tune the learning rate; aside from the data, this is the most important factor that determines whether your model is "meh" vs "great".
-- [Pack the training samples](https://huggingface.co/docs/trl/v0.23.0/en/reducing_memory_usage#packing) and tune the sequence length to match the distribution of your data. This will dramatically speed up training.
-
-
-
-TRL has a handy [application](https://huggingface.co/docs/trl/v0.23.0/en/reducing_memory_usage#how-to-choose-the-maxlength-value) to do this for you.
-
+- [Pack the training samples](https://huggingface.co/docs/trl/v0.23.0/en/reducing_memory_usage#packing) and tune the sequence length to match the distribution of your data. This will dramatically speed up training. TRL has a handy [application](https://huggingface.co/docs/trl/v0.23.0/en/reducing_memory_usage#how-to-choose-the-maxlength-value) to do this for you. A minimal TRL sketch putting several of these pieces together follows this list.
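Putting a few of these knobs together, here is a heavily hedged TRL sketch for an SFT baseline. The model and dataset identifiers are placeholders, and exact argument names (for instance the sequence-length field) vary between TRL versions, so check the docs for the release you use.

```python
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

# Placeholder dataset of chat-formatted conversations
dataset = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")

config = SFTConfig(
    output_dir="sft-baseline",
    learning_rate=2e-5,           # worth sweeping: it matters almost as much as the data
    packing=True,                 # pack samples up to the sequence length
    max_length=4096,              # match the length distribution of your data
    gradient_checkpointing=True,  # trade compute for memory on long sequences
)

trainer = SFTTrainer(
    model="HuggingFaceTB/SmolLM3-3B-Base",  # placeholder; any causal LM checkpoint works
    args=config,
    train_dataset=dataset,
)
trainer.train()
```

Assistant-only loss masking and Hub-hosted kernels mentioned above are enabled through additional TRL options; see the linked documentation for the exact flags in your version.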
Let's look at how some of these choices panned out for SmolLM3. For our first baseline experiments, we wanted a simple sanity check: does the chat template actually elicit hybrid reasoning? To test this, we compared three data mixtures from our [table](#sft-datasets):
@@ -3704,6 +3579,11 @@ To measure this quantitatively for SmolLM3, we took inspiration from Qwen3, who
To close this gap, we constructed a new dataset called IFThink. Based on the Multi-IF pipeline, we used single-turn instructions from [Tulu 3's instruction-following subset](https://huggingface.co/datasets/allenai/tulu-3-sft-personas-instruction-following) and expanded them into multi-turn exchanges using Qwen3-32B to generate both verifiable instructions and reasoning traces. The method is illustrated below:
+
+
+We considered filtering out conflicting instructions, but the initial results were strong enough to skip this step.
+
+
```mermaid
flowchart TD
%% Inputs
@@ -3752,11 +3632,6 @@ flowchart TD
```
Including this data in our baseline mix produced a dramatic improvement:
-
-
-We considered filtering out conflicting instructions, but the initial results were strong enough to skip this step.
-
-
@@ -3918,11 +3793,6 @@ Another important component we mentioned in the pre-training section is the opti
Continued pretraining—or mid-training if you want to sound fancy—means taking a base model and training it further on large amounts of domain-specific tokens before doing SFT. Mid-training is useful when your target capabilities for SFT share a common core skill, such as coding or reasoning. In practice, this shifts the model toward a distribution that better supports reasoning, a specific language, or any other capability you care about. Starting SFT from a model that has already integrated that core skill allows your model to better focus on the specific topics in your SFT data rather than using compute to learn the core skill from scratch.
-
-
-You might wonder why we're discussing continued pretraining *after* we did some SFT runs. Chronologically, mid-training happens before SFT on the base model. But the decision to do mid-training only becomes clear after you've run initial SFT experiments and identified performance gaps. In practice, you'll often iterate: run SFT to identify weak areas, then do targeted mid-training, then run SFT again. Think of this section as "what to do when SFT alone isn't enough."
-
-
The mid-training approach traces back to ULMFit [[@ulmfit](https://arxiv.org/abs/1801.06146)], which pioneered the three-stage pipeline of general pretraining → mid-training → post-training that is now common in modern LLMs like FAIR's Code World Model [[@cwm](https://huggingface.co/papers/2510.02387)]:
@@ -3950,6 +3820,11 @@ These results prompted us to try a similar approach. From our prior experience w
Since we planned to include reasoning data in the final SFT mix, we decided to keep Mixture of Thoughts for that stage and use the others for mid-training. We used ChatML as the chat template to avoid "burning in" the SmolLM3 one too early on. We also trained for 5 epochs with a learning rate of 2e-5, using 8 nodes to accelerate training with an effective batch size of 128.
+
+
+You might wonder why we're discussing mid-training *after* we did some SFT runs. Chronologically, mid-training happens before SFT on the base model. But the decision to do mid-training only becomes clear after you've run initial SFT experiments and identified performance gaps. In practice, you'll often iterate: run SFT to identify weak areas, then do targeted mid-training, then run SFT again. Think of this section as "what to do when SFT alone isn't enough."
+
+
**The mystery of the melting GPUs**
Running these experiments turned out to be a surprising challenge on our cluster: the aging GPUs would get throttled at various points, which would lead to hardware failures and forced restarts of each run. To give you a taste of what it was like, here are the logs from one of the runs, where each colour represents a restart:
@@ -3972,7 +3847,7 @@ As we later discovered, a bug with DP in Accelerate meant that the weights and g
To prevent this, most accelerators use FP32 for the "master weights" and optimiser states, and only cast back to BF16 for the forward and backward passes.
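As a toy illustration of that pattern (keeping FP32 master weights and optimiser states while running the forward and backward passes under BF16 autocast), here's a minimal sketch; a single linear layer stands in for the full model, and in practice frameworks like DeepSpeed or FSDP manage this for you:

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

model = torch.nn.Linear(4096, 4096).to(device)              # parameters stay in FP32 ("master weights")
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)  # optimiser states also live in FP32

x = torch.randn(8, 4096, device=device)
with torch.autocast(device_type=device, dtype=torch.bfloat16):
    loss = model(x).pow(2).mean()   # forward (and most of the backward) run in BF16
loss.backward()                      # gradients land on the FP32 master parameters
optimizer.step()                     # the weight update itself happens in FP32
```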
- So we switched back to DeepSpeed and added aggressive checkpointing to minimise the time lost from GPUs overheating and "falling off the bus". This strategy proved successful and is something we recommend more generally:
+So we switched back to DeepSpeed and added aggressive checkpointing to minimise the time lost from GPUs overheating and "falling off the bus". This strategy proved successful and is something we recommend more generally:
@@ -4008,19 +3883,14 @@ Although you can keep scaling SFT with more data, at some point you'll observe d
-The problem persists even if your dataset contains an even mix of traces (i.e. some that reach the correct solution immediately, and others where the model first makes a mistake and corrects it). In this case, the model may simply learn that making an initial error is part of the desired pattern: half the time it should start wrong and then correct itself. What we actually want, of course, is a model that aims to produce the correct solution from the start.
+The problem persists even if your dataset contains an even mix of traces (i.e. some that reach the correct solution immediately, and others where the model first makes a mistake and corrects it). In this case, the model may simply learn that making an initial error is part of the desired pattern. What we actually want, of course, is a model that can produce the correct solution from the start.
This is where preference optimisation comes in. Instead of just copying demonstrations, we give the model comparative feedback like "response A is better than response B". These preferences provide a more direct training signal for quality and enable model performance to scale beyond the limits of SFT alone.
Another benefit of preference optimisation is that you typically need far less data than SFT, since the starting point is already a pretty good model that can follow instructions and has knowledge from previous training stages.
-
-
-As we'll see below, there are some algorithms like [ORPO](https://arxiv.org/abs/2403.07691) which can be applied directly to base models.
-
-
- Let's take a look at how these datasets are created.
+Let's take a look at how these datasets are created.
#### Creating preference datasets
@@ -4028,13 +3898,13 @@ Historically, preference datasets were created by providing human annotators wit
**Strong vs. weak**
-1. Take a fixed set of prompts (often curated for coverage and difficulty).
-1. Generate one response from a weaker or baseline model, and another from a stronger or more advanced model.
-1. Label the stronger model's output as the preferred response.
+1. Take a fixed set of prompts $x$ (often curated for coverage and difficulty).
+1. Generate one response from a weaker or baseline model, and another from a high-performing model.
+1. Label the stronger model's output as the chosen response $y_c$ and the weaker one as rejected $y_r$ .
-This produces a dataset of "stronger vs. weaker" comparisons, which is simple to construct because we assume the stronger model's output is reliably better.
+This produces a dataset of "stronger vs. weaker" comparisons $\lbrace x, y_c, y_r \rbrace$, which is simple to construct because we assume the stronger model's output is reliably better.
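As a rough sketch of the recipe, with `generate_strong` and `generate_weak` standing in for whatever inference stack you use (e.g. vLLM or transformers):

```python
# Minimal "strong vs. weak" preference builder. Both generate_* functions are
# hypothetical placeholders that take a prompt and return a completion string.
def build_preference_pairs(prompts, generate_strong, generate_weak):
    records = []
    for x in prompts:
        y_c = generate_strong(x)   # stronger model's output -> chosen
        y_r = generate_weak(x)     # weaker model's output -> rejected
        records.append({"prompt": x, "chosen": y_c, "rejected": y_r})
    return records
```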
-Below is a popular example from Intel, which took an SFT dataset with responses from gpt-3.5 and gpt-4 and converted it into a preference dataset by selecting the gpt-4 responses as "chosen" and the gpt-3.5 ones as "rejected":
+Below is a popular example from Intel, who took an SFT dataset with responses from gpt-3.5 and gpt-4 and converted it into a preference dataset by selecting the gpt-4 responses as chosen and the gpt-3.5 ones as rejected:
- **On-policy with evaluation**
+ **On-policy with grading**
-1. Use the *same model* you are training to generate multiple candidate responses to the same prompt. This creates data that is "on-policy" because it reflects the distribution of outputs the model would naturally produce.
-1. Instead of relying on a stronger model as the reference, introduce an *external evaluator:* either a verifier or a reward model that scores responses along one or more quality axes (e.g., helpfulness or factual accuracy).
-1. The evaluator then assigns preference labels among the candidate responses, producing a more nuanced and flexible preference dataset.
+1. Use the *same model* you will train to generate multiple candidate responses to the same prompt. This creates data that is "on-policy" because it reflects the distribution of outputs the model would naturally produce.
+1. Instead of relying on a stronger model as the reference, introduce an *external grader:* either a verifier or a reward model that scores responses along one or more quality axes (e.g., helpfulness or factual accuracy).
+1. The grader then assigns preference labels among the candidate responses, producing a more nuanced and flexible preference dataset.
This method allows ongoing bootstrapping of preference data as the model improves, but its quality depends heavily on the grader's reliability and calibration.
-A nice example of such a dataset is from SnorkelAI, which took the prompts from a popular preference dataset called UltraFeedback, partitioned them into 3 sets, and then applied the above recipe iteratively to improve their model:
+A nice example of such a dataset is from SnorkelAI, who took the prompts from a popular preference dataset called [UltraFeedback](https://huggingface.co/datasets/openbmb/UltraFeedback), partitioned them into 3 sets, and then applied the above recipe iteratively to improve their model:
-At the time of SmolLM3's development, there did not exist any preference data with reasoning traces, so we decided to generate some of our own using the "strong vs weak" approach. We used the prompts from Ai2's Tulu 3 preference mixture to generate responses from Qwen3-0.6B and Qwen3-32B in the `/think` mode. The result was a large-scale dataset of 250k+ LLM-generated preferences, ready to simultaneously improve our SFT checkpoint across multiple axes using preference optimisation algorithms.
+At the time of SmolLM3's development, there did not exist any preference data with reasoning traces, so we decided to generate some of our own using the "strong vs weak" approach. We used the prompts from Ai2's Tulu 3 preference mixture to generate responses from Qwen3-0.6B and Qwen3-32B in the `/think` mode. The result was a [large-scale dataset of 250k+ LLM-generated preferences](https://huggingface.co/datasets/HuggingFaceTB/smoltalk2/viewer/Preference/tulu_3_8b_pref_mix_Qwen3_32B_Qwen3_0.6B_think), ready to simultaneously improve our SFT checkpoint across multiple axes using preference optimisation algorithms.
#### Which algorithm do I pick?
@@ -4066,22 +3936,22 @@ Direct Preference Optimization (DPO) [[@dpo](https://arxiv.org/abs/2305.18290)]
-When the DPO paper came out in mid-2023, there was widespread debate online about whether it could match RL methods, and there were no recipes showing it's effectiveness in non-academic settings. To address that, we released [Zephyr 7B](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) a few months later, training the entire model on synthetic data and showing significant performance gains from DPO.
+When the DPO paper came out in mid-2023, there was heated debate online about whether it could match RL methods, and there were no recipes showing its effectiveness in industrial settings. To address that, we released [Zephyr 7B](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) a few months later, training the entire model on synthetic data and showing significant performance gains from DPO.
Its appeal came from being simple to implement, stable in practice, and effective even with modest amounts of preference data. As a result, DPO has become the default method to improve SFT models before reaching for more complex techniques like RL.
But researchers quickly discovered there are many ways to improve upon DPO, and nowadays there is a wide variety of alternatives to explore. Below we list a few of the ones we've found most effective:
-- **Kahneman-Tversky Optimisation (KTO) [** [ **@kto** ](https://huggingface.co/papers/2402.01306) **]:** Instead of relying on preference pairs, KTO models where an individual response is "desirable or not", using ideas from human decision making. This is a good choice if you don't have access to paired preference data (e.g. raw responses collected from end-users).
+- **Kahneman-Tversky Optimisation (KTO) [** [ **@kto** ](https://huggingface.co/papers/2402.01306) **]:** Instead of relying on preference pairs, KTO models whether an individual response is "desirable or not", using ideas from human decision making. This is a good choice if you don't have access to paired preference data (e.g. raw responses like 👍 or 👎 collected from end-users).
- **Odds Ratio Preference Optimisation (ORPO) [** [ **@orpo** ](https://huggingface.co/papers/2403.07691) **]:** integrates preference optimisation directly into SFT by adding an odds ratio to the cross-entropy loss. As a result there is no need for a reference model or SFT stage, which makes this method more computationally efficient.
-- **Anchored Preference Optimisation (APO) [** [ **@apo** ](https://huggingface.co/papers/2408.06266) **]:** this is a more controllable objective the explicitly regularises how much the model's likelihoods for chosen vs rejected outputs should shift, rather than just optimising their difference. There are two variants (APO-zero and APO-down) who's choice depends on the relationship between your model and the preference data (i.e. whether the chosen outputs are better than the model or worse).
+- **Anchored Preference Optimisation (APO) [** [ **@apo** ](https://huggingface.co/papers/2408.06266) **]:** this is a more controllable objective that explicitly regularises how much the model's likelihoods for chosen vs. rejected outputs should shift, rather than just optimising their difference. There are two variants (APO-zero and APO-down), whose choice depends on the relationship between your model and the preference data, i.e. whether the chosen outputs are better or worse than what your model produces.
Luckily, many of these choices are just a one-line change in TRL's `DPOTrainer` , so for our initial baseline we did the following:
- Use the prompts and completions from Ai2's [Tülu3 Preference Personas IF dataset](https://huggingface.co/datasets/allenai/tulu-3-pref-personas-instruction-following) to measure the improvements for instruction-following on IFEval with the `/no_think` reasoning mode.
- Re-use the prompts from the above, but now generate "strong vs. weak" preference pairs with Qwen3-32B and Qwen3-0.6B. This gave us preference data for the `/think` reasoning mode.
-- Train for 1 epoch and measure the in-domain improvements on IFEval, along with the out-of-domain impact on other evals like AIME25 which are directly correlated with instruction-following.
+- Train for 1 epoch and measure the *in-domain* improvements on IFEval, along with the *out-of-domain* impact on other evals like AIME25 which are directly correlated with instruction-following.
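To make the "one-line change" concrete, here's a minimal sketch with TRL's `DPOTrainer`. The model name is a placeholder for your SFT checkpoint, and loss types like `"apo_zero"` are available in recent TRL versions, so check the docs for the release you're running:

```python
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer

# Assumes the dataset exposes prompt/chosen/rejected columns, as the Tülu 3 preference sets do.
dataset = load_dataset("allenai/tulu-3-pref-personas-instruction-following", split="train")

config = DPOConfig(
    output_dir="po-baseline",
    loss_type="apo_zero",      # swap for "sigmoid" (vanilla DPO), "apo_down", etc.
    beta=0.1,                  # controls how far the policy can drift from the reference
    learning_rate=1e-6,        # ~10-20x smaller than the SFT learning rate
    num_train_epochs=1,
)

trainer = DPOTrainer(
    model="your-org/your-sft-checkpoint",  # placeholder for the SFT model you start from
    args=config,
    train_dataset=dataset,
)
trainer.train()
```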
As shown in the figure below, the in-domain improvements for both reasoning modes were significant: on IFEval, APO-zero improved over the SFT checkpoint by 15-20 percentage points!
@@ -4104,23 +3974,25 @@ For preference optimisation, there are typically only three hyperparameters that
- The β parameter, which typically controls the size of the margin between preference pairs.
- The batch size.
-Let's take a look at how these played out for SmolLM3, starting from the [ `it-SFT` ](https://huggingface.co/HuggingFaceTB/SmolLM3-3B-checkpoints/tree/it-SFT)[ checkpoint](https://huggingface.co/HuggingFaceTB/SmolLM3-3B-checkpoints/tree/it-SFT) we trained over the whole of `smoltalk2` .
+Let's take a look at how these played out for SmolLM3, starting from the [SFT checkpoint](https://huggingface.co/HuggingFaceTB/SmolLM3-3B-checkpoints/tree/it-SFT) we trained over the whole of `smoltalk2` .
- **Learning curves**
+ **Use small learning rates for best performance**
-The first ablation we ran was to check the influence of the learning rate on model performance. We first ran experiments to determine the influence of learning rates between ~200x smaller (1e-7) and ~2x smaller (1e-5) than the SFT learning rate (2e-5). Previous projects like Zephyr 7B had taught us that the best learning rate for PO is around 10x smaller than the one used for SFT, and the ablations we ran for SmolLM3 confirmed this rule of thumb.
+The first ablation we ran was to check the influence of the learning rate on model performance. We ran experiments to determine the influence of learning rates between ~200x smaller (1e-7) and ~2x smaller (1e-5) than the SFT learning rate (2e-5). Previous projects like Zephyr 7B had taught us that the best learning rate for preference optimisation methods is around 10x smaller than the one used for SFT, and the ablations we ran for SmolLM3 confirmed this rule of thumb.
-As shown in the figure below, learning rates ~10x smaller improve the performance of the SFT model in both reasoning modes, but all learning rates beyond that 10x limit result in worse performance for the extended thinking mode. The trend for the mode without extended thinking is more stable, with the best learning rate at 5e-6. This is mostly driven by a single benchmark (LiveCodeBench v4), so we opted for 1e-6 in our SmolLM3 runs.
+As shown in the figure below, learning rates ~10x smaller improve the performance of the SFT model in both reasoning modes, but all learning rates beyond that 10x limit result in worse performance for the extended thinking mode:
+The trend for the `/no_think` reasoning mode is more stable, with the best learning rate at 5e-6. This is mostly driven by a single benchmark (LiveCodeBench v4), so we opted for 1e-6 in our SmolLM3 runs.
+
Our recommendation for your training runs is to scan learning rates in a range 5x to 20x smaller than your SFT learning rate. It is highly likely that you will find your optimal performance within that range!
- **Tuning β**
+ **Tune your β**
-The experiments we ran for the ß parameter ranged from 0.01 to 0.99 to explore values that encourage different degrees of alignment to the reference model. As a reminder, lower values of beta encourage staying close to the reference model while higher values allow the PO model to match the preference data more closely. The model performance for ß=0.1 is the highest for both reasoning modes and improves compared to the metrics from the SFT checkpoint. Using a low beta value hurts model performance and results in a worse model than the SFT checkpoint, while performance remains stable across multiple ß values without extended thinking.
+The experiments we ran for the β parameter ranged from 0.01 to 0.99 to explore values that encourage different degrees of alignment to the reference model. As a reminder, lower values of β encourage staying close to the reference model while higher values allow the model to match the preference data more closely. The model performance for β=0.1 is the highest for both reasoning modes and improves compared to the metrics from the SFT checkpoint. Using a low β value hurts model performance and results in a worse model than the SFT checkpoint, while performance remains stable across multiple β values without extended thinking.
These results suggest that values greater than 0.1 are preferable for preference optimisation, and that aligning the model with the preference data is more beneficial than staying close to the reference model. However, we suggest exploring β values in the range 0.01 to 0.5. Higher values may erase capabilities from the SFT checkpoint that we might not be capturing in the evals shown on the plot.
@@ -4136,11 +4008,13 @@ We also ran experiments to determine how dataset size influences results, testin
-Bringing all these choices together led to the final SmolLM3-3B model, which was best-in-class for its size and sat on the Pareto front with Qwen's own hybrid reasoning models:
+ **Bringing it all together**
+
+Bringing all these threads together produced the final SmolLM3-3B model: best-in-class for its size and sitting on the Pareto front with Qwen's own hybrid reasoning models.
-
+
@@ -4157,21 +4031,21 @@ To summarise our findings about preference optimisation that could be useful for
- Scan over β, usually in the range 0.01 to 0.5
- Since most preference algorithms overfit after one epoch, partition your data and train iteratively for best performance.
-Preference optimisation is often the sweet spot between simplicity and performance, but it still inherits a key limitation: it's only as good as the offline preference data you can collect. At some point, static datasets run out of signal and you need methods that can generate fresh training feedback online as the model interacts with prompts and environment. That's where preference optimisation meets the broader family of *online and RL-based methods.*
+Preference optimisation is often the sweet spot between simplicity and performance, but it still inherits a key limitation: it's only as good as the offline preference data you can collect. At some point, static datasets run out of signal and you need methods that can generate fresh training feedback online as the model interacts with prompts and environment. That's where preference optimisation meets the broader family of *on-policy and RL-based methods.*
-### Going online and beyond supervised labels
+### Going on-policy and beyond supervised labels
If you want your model to consistently solve math problems, generate executable code, or plan across multiple steps, you often need a **reward signal** rather than just "A is better than B".
This is where RL starts to make sense. Instead of supervising the model with preferences, you let it interact with an environment (which could be a math verifier, a code executor, or even real user feedback), and learn directly from the outcomes. RL shines when:
-- **You can check correctness automatically** (e.g., unit tests, mathematical proofs, API calls) or have access to a high-quality verifier or reward model.
+- **You can check correctness automatically,** e.g., unit tests, mathematical proofs, API calls, or have access to a high-quality verifier or reward model.
- **The task requires multi-step reasoning or planning** , where local preferences may not capture long-term success.
- **You want to optimise for objectives beyond preference labels** , like passing unit tests for code or maximising some objective.
When it comes to LLMs, there are two main flavours of RL:
-- **Reinforcement Learning from Human Feedback (RLHF):** this is the approach that was popularised by OpenAI's InstructGPT paper [[@instructgpt](https://huggingface.co/papers/2203.02155)] and the basis for gpt-3.5 and many modern models. Here, human annotators compare model outputs (e.g. "A is better than B") and a reward model is trained to predict those preferences. The policy is then fine-tuned with RL to maximise the learned reward.
+- **Reinforcement Learning from Human Feedback (RLHF):** this is the approach that was popularised by OpenAI's InstructGPT paper [[@instructgpt](https://huggingface.co/papers/2203.02155)] and the basis for gpt-3.5 and many modern LLMs. Here, human annotators compare model outputs (e.g. "A is better than B") and a reward model is trained to predict those preferences. The policy is then fine-tuned with RL to maximise the learned reward.
@@ -4182,16 +4056,16 @@ Because the reward model only approximates human preferences, it can sometimes e
Both RLHF and RLVR define *what* the model is being optimised for, but they don't tell us *how* that optimisation should be carried out. In practice, the efficiency and stability of RL-based training depends heavily on whether the learning algorithm is **on-policy** or **off-policy** .
-Methods such as GRPO typically fall into the category of on-policy optimisation algorithms, where the model (the policy) that generates the completions is the same as the one being optimised. While it is broadly the case that GRPO is an on-policy algorithm, there are a few caveats. First, to optimise the generation step, several batches of generations may be sampled and then **k** updates are made to the model, with the first batch being on-policy with the next few batches being slightly off-policy.
+Methods such as GRPO typically fall into the category of on-policy optimisation algorithms, where the model (the policy) that generates the completions is the same as the one being optimised. While it is broadly the case that GRPO is an on-policy algorithm, there are a few caveats. First, to optimise the generation step, several batches of generations may be sampled and then $k$ updates are made to the model, with the first batch being on-policy and the next few batches slightly off-policy.
To account for policy-lag between the model used for generation and the current model being optimised, importance sampling and clipping are used to re-weight the token probabilities and restrict the size of the updates.
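For intuition, here's a minimal sketch of the clipped importance-sampling objective used in PPO-style updates like GRPO's; token-level details such as group-normalised advantages and any KL term are omitted:

```python
import torch

def clipped_policy_loss(logp_new, logp_old, advantages, eps=0.2):
    """PPO/GRPO-style clipped surrogate over per-token log-probabilities."""
    ratio = torch.exp(logp_new - logp_old)          # importance weight for slightly stale generations
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - eps, 1 + eps) * advantages
    # Maximise the surrogate, so minimise its negative; the min() caps how far one update can go.
    return -torch.min(unclipped, clipped).mean()
```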
-We mention here off-policy RL, but there are several truly off-policy RL algorithms, such as Q-Learning, where the policy used for generate trajectories can be totally different to the policy being optimized. When GRPO is applied to LLMs the policy used for generation can lag behind the one used for optimization, but typically there are less than 16 steps difference between the two.
+We mention off-policy RL here, but there are also truly off-policy algorithms, such as Q-learning, where the policy used to generate trajectories can be completely different from the policy being optimised. When GRPO is applied to LLMs, the policy used for generation can lag behind the one being optimised, but typically by fewer than 16 steps.
-As autoregressive generation from LLMs is slow, many frameworks like verl and PipelineRL have added asynchronous generation of completions and "in-flight" updates of model weights to maximise training throughput. These approaches require more complex and careful implementation, but can achieve training speeds that are 4-5x higher than synchronous training methods. As we'll see later, these improvements in training efficiency are especially pronounced for reasoning models, which have long-tail token distributions.
+As autoregressive generation from LLMs is slow, many frameworks like [verl](https://github.com/volcengine/verl) and [PipelineRL](https://github.com/ServiceNow/PipelineRL) have added asynchronous generation of completions and "in-flight" updates of model weights to maximise training throughput. These approaches require more complex and careful implementation, but can achieve training speeds that are 4-5x higher than synchronous training methods. As we'll see later, these improvements in training efficiency are especially pronounced for reasoning models, which have long-tail token distributions.
For SmolLM3, we skipped RL altogether, mostly due to time constraints and having a model that was already best-in-class with offline preference optimisation. However, since the release, we have revisited the topic and will close out the post-training chapter by sharing some of our lessons from applying RLVR to hybrid reasoning models.
@@ -4203,19 +4077,19 @@ Hybrid reasoning models pose additional complexity for RLVR because generation l
-As you can see, the `/no_think` mode generates solutions of around 2k tokens, while the `/think` mode is much larger with 16k tokens and a fat-tailed distribution. Ideally, we would like to improve the overall performance of both modes with RLVR, without changing their respective length distributions too radically.
+As you can see, the `/no_think` mode generates solutions with a median length of around 2k tokens, while the `/think` mode is much larger with 16k tokens and a fat-tailed distribution. Ideally, we would like to improve the overall performance of both modes with RLVR, without changing their respective length distributions too radically.
To explore this, we focused on optimising the `/no_think` mode first and took a subset of prompts from [Big-Math](https://huggingface.co/datasets/SynthLabsAI/Big-Math-RL-Verified), a dataset of over 250k math problems with verified answers.
-To our surprise, naively applying GRPO leads to a form of reward hacking: despite being never being prompted to emit a long CoT, the model learns to exploit its base reasoning capabilities to increase the reward. As shown in the figure below, not only does the reward go up, but so too does the length of the completions:
+To our surprise, naively applying GRPO leads to a form of *reward hacking* : despite never being prompted to emit a long CoT, the model learns to exploit its base reasoning capabilities to increase the reward. As shown in the figure below, not only does the reward go up, but so too does the length of the completions:
-
+
In other words, RLVR with GRPO has converted the `/no_think` mode of our reasoning model into one which looks very similar to the `/think` mode!
-When we review the model's completions, we see that the model now generates long CoT and even includes classic reasoning model grammar such as "Wait, …". An example of this is shown below:
+When we review the model's completions, we see that the model now generates a long CoT and even includes some of the cognitive behaviours [@cognitivebehaviours] like "Wait, …" that are associated with reasoning models. An example of this is shown below:
```
However, since the jogger and the train are moving in the same direction, the relative speed between them is the difference of their speeds:
@@ -4238,9 +4112,10 @@ But first, the initial position: the jogger is 340 meters ahead of the engine. S
**Wait** the problem says: "A jogger running at 10 km/hr alongside a railway track is 340 m ahead of the engine of a train running at 46 km/hr in the same direction. The train takes 46 seconds to pass the jogger. How long is the train in meters?"
Hmm, so the jogger is 340 m ahead of the engine along the track. Since they're moving in the same direction, the train is behind the jogger by 340 meters. To pass the jogger, the train has to cover that 340 meters plus the length of the train itself?
-
```
-This issue can mitigated with an additional overlong completion penalty, that penalizes completions over a certain length. The penalty is parameterized by two arguments max completion length $L_{max}$ and soft punishment cache $L_{cache}$ . This penalty was one of the improvements proposed in the DAPO paper [[@dapo](https://huggingface.co/papers/2503.14476)] and amounts to applying a reward function as follows:
+ **Mitigating reward hacking with overlong penalties**
+
+This issue can be mitigated by including an *overlong completion penalty*, which penalises completions over a certain length. The penalty is parameterised by two arguments: the maximum completion length $L_{max}$ and the soft punishment cache $L_{cache}$. This penalty was one of the improvements proposed in the DAPO paper [[@dapo](https://huggingface.co/papers/2503.14476)] and amounts to applying a reward function as follows:
$$
@@ -4252,54 +4127,59 @@ R_{\text{length}}(y) = \begin{cases}
$$
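Here's a small sketch of that reward shaping as we read it from the DAPO paper; the $L_{max}$ and $L_{cache}$ values below are illustrative defaults, not the ones used in our runs:

```python
def overlong_penalty(length: int, l_max: int = 4096, l_cache: int = 512) -> float:
    """Soft overlong punishment: no penalty inside the budget, a linear ramp
    down to -1 inside the soft window, and the full -1 penalty beyond l_max."""
    if length <= l_max - l_cache:
        return 0.0
    if length <= l_max:
        return ((l_max - l_cache) - length) / l_cache
    return -1.0
```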
-Using this penalty you can directly control the model's output distribution and the tradeoff between increasing length and performance. Where we vary the overlong penalty from 1.5k to 4k in steps of 512 tokens.
+Using this penalty, we can directly control the model's output distribution and measure the tradeoff between increasing response length and performance. An example is shown in the figure below, where we vary the overlong penalty from 1.5k to 4k in steps of 512 tokens:
-
+
-This tradeoff is better shown in the downstream performance on the AIME25 benchmark:
+The tradeoff between response length and performance is clearer when we examine the improvements on AIME25:
-
+
-If we take the checkpoints from step 400 for the different overlong penalties, we can compare the output token distributions and find a compromise of performance and completion length:
+Now we can clearly see how the overlong penalty impacts downstream performance, with penalties in the range 2-4k producing significant improvements, while keeping the token distribution in check. As shown in the figure below, if we take the checkpoints from step 400, we can compare the output token distributions between the initial policy and final model across a range of different penalties:
-We have shown that with a carefully selected reward function and overlong penalty, we can improve Smollm3 instruction mode performance with Reinforcement Learning. The next step in the RL training pipeline would be joint training of the model in both instruct and reasoning mode. With a more relaxed overlong penalty for the reasoning completions. We find however that joint training is challenging. We have observed that trying to improve model performance across a variety of benchmarks, in both instruct and reasoning modes, is a challenging endeavour. It appears that most open models such as the latest Qwen3 models have been released as separate instruct and reasoning variant, we believe the industry is also struggling to balance the performance in these two modes.
+ **Bringing it all together**
+
+We find that applying a length penalty in the range 2.5-3k gives the best tradeoff between performance and response length, with the figure below showing that GRPO nearly doubles the performance on AIME 2025 over offline methods like APO:
+
+
+
+
+
+Now that we know how to improve performance in the `/no_think` reasoning mode, the next step in the RL training pipeline would be *joint training* of the model in both reasoning modes at once. However, we have found this to be quite a tough nut to crack because each mode requires its own length penalty and the interplay has thus far produced unstable training. This highlights the main challenge with trying to apply RL to hybrid reasoning models, and we can see this reflected in a new trend from model developers like Qwen to release the [instruct](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507) and [reasoning](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507) variants separately.
Our experiments show that RLVR can steer reasoning behaviour effectively, but only with careful reward shaping and stability mechanisms. Given this complexity, it's worth asking whether reinforcement learning is the only viable path forward. In fact, several lighter-weight, on-policy optimisation strategies have been proposed in recent literature, yet remain surprisingly underexplored by the open-source community. Let's close out this chapter by taking a look at some of them.
#### Is RL the only game in town?
-Other approaches to online learning extend preference learning and distillation into iterative loops that refresh the training signal as the model evolves:
+Other approaches to on-policy learning extend preference optimisation and distillation into iterative loops that refresh the training signal as the model evolves:
-- **Online DPO:** rather than training once on a fixed preference dataset, the model continually samples new responses, collects fresh preference labels (from humans, reward models, or stronger LLMs), and updates itself. This keeps the optimisation *on-policy* and reduces drift between training data and the model's current behaviour.
-- **Online knowledge distillation:** instead of preferences, the signal comes from a stronger teacher model. The student samples responses at every training step and the KL divergence between the student and teacher logits on these samples provides the learning signal. This allows the student to continuously absorb the teacher's capabilities, without needing explicit preference labels or verifiers.
+- **Online DPO:** rather than training once on a fixed preference dataset, the model continually samples new responses, collects fresh preference labels (from reward models or LLM graders), and updates itself. This keeps the optimisation *on-policy* and reduces drift between training data and the model's current behaviour [@onlinedpo].
+- **On-policy distillation:** instead of preferences, the signal comes from a stronger teacher model. The student samples responses at every training step and the KL divergence between the student and teacher logits on these samples provides the learning signal. This allows the student to continuously absorb the teacher's capabilities, without needing explicit preference labels or verifiers [@gkd].
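Here's a minimal sketch of that learning signal, assuming you already have per-token logits from the student (on its own samples) and from the teacher scoring the same tokens; frameworks like TRL's GKD trainer add the sampling loop, masking, and scheduling around this:

```python
import torch
import torch.nn.functional as F

def on_policy_kl(student_logits: torch.Tensor, teacher_logits: torch.Tensor) -> torch.Tensor:
    """Reverse KL(student || teacher), averaged over sampled tokens.
    Both tensors have shape (batch, seq_len, vocab) and come from the student's own samples."""
    student_logp = F.log_softmax(student_logits, dim=-1)
    teacher_logp = F.log_softmax(teacher_logits, dim=-1)
    kl = (student_logp.exp() * (student_logp - teacher_logp)).sum(dim=-1)
    return kl.mean()
```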
These methods blur the line between static preference optimisation and full RL: you still get the benefits of adapting to the model's current distribution, but without the full complexity of designing and stabilising a reinforcement learning loop.
#### Which method do I pick?
-Although there are a gazillion research papers about which online method is "best", in practice the decision depends on a few factors shown in the table below:
+Although there are a gazillion research papers about which on-policy method is "best", in practice the decision depends on a few factors shown in the table below:
| Algorithm | When to Use | Tradeoffs | Best for Model Size |
| --- | --- | --- | --- |
-| **Online DPO** | You can get preference labels cheaply (from humans, reward models, or stronger LLMs). Best for aligning behaviour with evolving distributions. | Easy to scale iteratively, more stable than RL, but depends on label quality and coverage. Supported in few training frameworks. | Mid–large models (10B+), where preferences capture improvements beyond imitation. |
-| **Online knowledge distillation** | You have access to a stronger teacher model and want to transfer capabilities efficiently. | Simple to implement, cheap to run, inherits teacher biases, ceiling limited by teacher. Supported only in TRL | Most effective for small to mid-sized models (\<30B). |
-
-| **Reinforcement learning** | Best when you have verifiable rewards (unit tests, math proofs, API success rates) or tasks requiring multi-step reasoning/planning.
-
-Can be used with reward models, but there are challenges like reward-hacking; where the model takes advantages in weaknesses in the reward model. | Flexible and powerful, but costly and harder to stabilise; requires careful reward shaping. Supported in most post-training frameworks. | Large models (20B+), where extra capacity lets them exploit structured reward signals. |
+| **Online DPO** | You can get preference labels cheaply. Best for aligning behaviour with evolving distributions. | Easy to scale iteratively, more stable than RL, but depends on label quality and coverage. Supported in few training frameworks. | Any size, where preferences capture improvements beyond imitation. |
+| **On-policy distillation** | You have access to a stronger teacher model and want to transfer capabilities efficiently. | Simple to implement, cheap to run, inherits teacher biases, ceiling limited by teacher. Supported only in TRL and NemoRL | Most effective for small to mid-sized models (\<30B). |
+| **Reinforcement learning** | Best when you have verifiable rewards or tasks requiring multi-step reasoning/planning. Can be used with reward models, but there are challenges like reward-hacking, where the model takes advantage of weaknesses in the reward model. | Flexible and powerful, but costly and harder to stabilise; requires careful reward shaping. Supported in most post-training frameworks. | Mid to large models (20B+), where extra capacity lets them exploit structured reward signals. |
-In the open-source ecosystem, reinforcement learning methods like GRPO and REINFORCE tend to be the most widely used, although the Qwen3 tech report [[@qwen3](https://arxiv.org/abs/2505.09388)] highlighted the use of online knowledge distillation to train the models under 32B:
+In the open-source ecosystem, reinforcement learning methods like GRPO and REINFORCE tend to be the most widely used, although the Qwen3 tech report [[@qwen3](https://arxiv.org/abs/2505.09388)] highlighted the use of on-policy distillation to train the models under 32B parameters:
@@ -4330,18 +4210,24 @@ flowchart LR
```
-Comparison of reinforcement learning and on-policy distillation on Qwen3-8B. Numbers in parentheses indicate pass@64 scores.
+One interesting property of on-policy distillation with small models is that it typically outperforms RL-based methods at a fraction of the compute cost. This is because instead of generating multiple rollouts per prompt, we only sample one, which is then graded by the teacher in a single forward-backward pass. As the Qwen3 tech report shows, the gains over GRPO can be significant:
| Method | AIME'24 | AIME'25 | MATH500 | LiveCodeBench v5 | MMLU -Redux | GPQA -Diamond | GPU Hours |
| --- | --- | --- | --- | --- | --- | --- | --- |
-| Off-policy Distillation | 55.0 (90.0) | 42.8 (83.3) | 92.4 | 42.0 | 86.4 | 55.6 | - |
-| + Reinforcement Learning | 67.6 (90.0) | 55.5 (83.3) | 94.8 | 52.9 | 86.9 | 61.3 | 17,920 |
-| + On-policy Distillation | **74.4 (93.3)** | **65.5 (86.7)** | **97.0** | **60.3** | **88.3** | **63.3** | 1,800 |
+| Off-policy Distillation | 55.0 | 42.8 | 92.4 | 42.0 | 86.4 | 55.6 | - |
+| + Reinforcement Learning | 67.6 | 55.5 | 94.8 | 52.9 | 86.9 | 61.3 | 17,920 |
+| + On-policy Distillation | **74.4** | **65.5** | **97.0** | **60.3** | **88.3** | **63.3** | 1,800 |
-Similarly, researchers at FAIR have compared the effect of being fully offline to online for DPO and shown that it's possible to match the performance of GRPO using far less compute [[@online-offline](https://huggingface.co/papers/2506.21495)]:
+More recently, [Thinking Machines](https://thinkingmachines.ai/blog/on-policy-distillation/) have shown that on-policy distillation is also effective at mitigating *catastrophic forgetting*, where a post-trained model is further trained on a new domain and its prior performance regresses. In the table below, they show that although the chat performance of Qwen3-8B (IFEval) tanks when it's fine-tuned on internal data, the behaviour can be restored with cheap distillation:
+
+
+
+We ourselves are quite excited by on-policy distillation as there's a huge diversity of capable, open-weight LLMs that can be distilled into smaller, task-specific models. However, one weakness with all on-policy distillation methods is that the teacher and student must share the same tokenizer. To address that, we've developed a new method called General On-Policy Logit Distillation (GOLD), which allows any teacher to be distilled into any student. We recommend checking out our [technical write-up](https://huggingface.co/spaces/HuggingFaceH4/on-policy-distillation) if you're interested in these topics.
+
+Similarly, researchers at FAIR have compared the effect of being fully off-policy to on-policy for DPO and shown that it's possible to match the performance of GRPO using far less compute [[@online-offline](https://huggingface.co/papers/2506.21495)]:
@@ -4350,16 +4236,18 @@ Similarly, researchers at FAIR have compared the effect of being fully offline t
-As shown in their paper, online DPO works well for math tasks and even the semi-online variant achieves comparable performance despite being quite off-policy:
+As shown in their paper, online DPO works well for math tasks and even the semi-on-policy variant achieves comparable performance despite being many steps off-policy:
| Training method | Math500 | NuminaMath | AMC23 |
| --- | --- | --- | --- |
-| Seed (Llama-3.1-8B-Instruct) | 47.4 (1.6) | 33.9 (0.6) | 23.7 (5.2) |
-| Offline DPO (s = ∞) | 53.7 (1.6) | 36.4 (0.6) | 28.8 (7.0) |
-| Semi-online DPO (s = 100) | **58.9** (1.2) | 39.3 (0.4) | **35.1** (5.3) |
-| Semi-online DPO (s = 10) | 57.2 (1.1) | 39.4 (0.5) | 31.4 (4.3) |
-| Online DPO (s = 1) | 58.7 (1.2) | **39.6** (0.5) | 32.9 (5.2) |
-| GRPO | 58.1 (1.3) | 38.8 (0.5) | 33.6 (5.1) |
+| Seed (Llama-3.1-8B-Instruct) | 47.4 | 33.9 | 23.7 |
+| Offline DPO (s = ∞) | 53.7 | 36.4 | 28.8 |
+| Semi-online DPO (s = 100) | **58.9** | 39.3 | **35.1** |
+| Semi-online DPO (s = 10) | 57.2 | 39.4 | 31.4 |
+| Online DPO (s = 1) | 58.7 | **39.6** | 32.9 |
+| GRPO | 58.1 | 38.8 | 33.6 |
+
+Overall, we feel that there still remains much to be done with both scaling RL effectively [@scalerl] and exploring other methods for computational efficiency. Exciting times indeed!
### Wrapping up post-training
@@ -4375,27 +4263,25 @@ But as you've probably realised, knowing how to train great models is only half
## Infrastructure - the unsung hero
-Now that you know all that we know about model creation and training, let's focus on the one **underrated** component which can make or break your project (and your bank account) if you don't understand it properly: infrastructure. Whether you focus on framework development, model architecture, or even data curation, understanding infrastructure basics will help you identify bottlenecks in your training pipeline, make informed decisions about parallelism strategies, and debug throughput issues. (At the very least it'll help you communicate more effectively with your infrastructure colleagues 😉).
+Now that you know everything we know about model creation and training, let's address the critical yet *underrated* component that can make or break your project (and your bank account): infrastructure. Whether you focus on frameworks, architecture, or data curation, understanding infrastructure basics helps you identify training bottlenecks, optimise parallelism strategies, and debug throughput issues. (At a minimum, it improves communication with infrastructure teams 😉).
Most people training models care deeply about architecture and data, yet very few understand the infrastructure details. Infrastructure expertise typically lives with framework developers and cluster engineers, and gets treated by the rest as a solved problem: rent some GPUs, install PyTorch, and you're good to go. We trained SmolLM3 on 384 H100s for nearly a month, processing a total of 11 trillion tokens… and this was not a smooth ride! During that time, we dealt with node failures, storage issues and run restarts (see the [training marathon section](#the-training-marathon)). You need to have good contingency plans and strategies to prepare for these issues, and keep training smooth and low-maintenance.
-The results reveal an important characteristic of memory
-
This chapter aims to bridge that knowledge gap. Think of it as a practical guide to the hardware layer, focused on the questions that matter for training. (Note: Each subsection starts with a TL;DR so you can choose your depth level.)
-The first two sections tackles the fundamentals of how hardware works: what does a GPU actually consist of? How does memory hierarchy work? How do CPUs and GPUs communicate? We'll also go over you what to consider when acquiring GPUs and how to test them before committing to long training runs. Most importantly, we'll show you at each step how to measure and diagnose these systems yourself. The next sections are then more applied, and we'll see how to make your infra resilient to failure, and how to maximally optimize your training throughput.
+The first two sections tackle the fundamentals of how hardware works: what does a GPU actually consist of? How does the memory hierarchy work? How do CPUs and GPUs communicate? We'll also walk you through what to consider when acquiring GPUs and how to test them before committing to long training runs. Most importantly, we'll show you at each step how to measure and diagnose these systems yourself. The next sections are then more applied, and we'll see how to make your infra resilient to failure, and how to maximise your training throughput.
The name of the game in this chapter is finding and fixing bottlenecks!
-Think of this as building your intuition for why certain design decisions matter. When you understand that your model's activations need to flow through multiple levels of cache, each with different bandwidth and latency characteristics, you'll naturally start thinking about how to structure your training to minimize data movement. When you see that inter-node communication is orders of magnitude slower than intra-node, you'll understand why parallelism strategies matter so much.
+Think of this as building your intuition for why certain design decisions matter. When you understand that your model's activations need to flow through multiple levels of cache, each with different bandwidth and latency characteristics, you'll naturally start thinking about how to structure your training to minimise data movement. When you see that inter-node communication is orders of magnitude slower than intra-node, you'll understand why parallelism strategies matter so much.
Let's start by cracking open a GPU and seeing what's inside.
### Inside a GPU: Internal Architecture
-A GPU is fundamentally a massively parallel processor optimized for throughput over latency. Unlike CPUs, which excel at executing a few complex instruction streams quickly, GPUs achieve performance by executing thousands of simple operations simultaneously.
+A GPU is fundamentally a massively parallel processor optimised for throughput over latency. Unlike CPUs, which excel at executing a few complex instruction streams quickly, GPUs achieve performance by executing thousands of simple operations simultaneously.
-The key to understanding GPU performance lies in recognizing that it's not just about raw compute power, it's about the interplay between computation and data movement. A GPU can have teraflops of theoretical compute, but if data can't reach the compute units fast enough, that potential goes unused. This is why we need to understand both the memory hierarchy (how data moves) and the compute pipelines (how work gets done).
+The key to understanding GPU performance lies in recognising that it's not just about raw compute power, it's about the interplay between computation and data movement. A GPU can have teraflops of theoretical compute, but if data can't reach the compute units fast enough, that potential goes unused. This is why we need to understand both the memory hierarchy (how data moves) and the compute pipelines (how work gets done).
At the highest level, a GPU therefore performs two essential tasks:
@@ -4406,7 +4292,7 @@ At the highest level, a GPU therefore performs two essential tasks:
-TL;DR: GPUs measure performance in FLOPs (floating-point operations per second). Modern GPUs like the H100 deliver dramatically higher throughput at lower precision: 990 TFLOPs at BF16 vs 67 TFLOPs at FP32. However, real-world performance is 70-77% of theoretical peaks due to memory bottlenecks. State-of-the-art training achieves 20-41% end-to-end efficiency (MFU). Use realistic numbers, not marketing specs, when planning training runs.
+TL;DR: GPUs measure performance in FLOPs (floating-point operations per second). Modern GPUs like the H100 deliver dramatically higher throughput at lower precision: 990 TFLOPs at BF16 vs 67 TFLOPs at FP32. However, real-world performance is 70-77% of theoretical peaks due to memory bottlenecks. State-of-the-art training achieves 20-41% end-to-end efficiency, also known as model flops utilization (MFU). Use realistic numbers, not marketing specs, when planning training runs.
GPU compute performance is measured in **FLOPs** (floating-point operations per second). A FLOP is a single arithmetic operation, typically a floating-point addition like `a + b`, and modern GPUs can execute trillions of these per second (TFLOPs).
@@ -4415,6 +4301,11 @@ The fundamental building blocks of GPU compute are **Streaming Multiprocessors
Modern GPUs organize hundreds of these SMs across the chip! For example, the [H100](https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/) SXM5 version (which is the GPU we're using on our cluster) contains 132 SMs. Each SM operates independently, executing groups of 32 threads called **warps** in lockstep. To keep the hardware busy, each SM relies on another component, the **warp schedulers**: by issuing instructions to different warps, they let the SM "hide latency" by switching to another warp whenever one is stalled. This **SIMT** (Single Instruction, Multiple Thread) execution model means all threads in a warp execute the same instruction simultaneously on different data.
+
+
+Warps are named in reference to weaving, "the first parallel thread technology", according to [Lindholm et al., 2008](https://www.cs.cmu.edu/afs/cs/academic/class/15869-f11/www/readings/lindholm08_tesla.pdf). The equivalent of warps in other GPU programming models include [subgroups ](https://github.com/gpuweb/gpuweb/pull/4368) in WebGPU, [waves ](https://microsoft.github.io/DirectX-Specs/d3d/HLSL_SM_6_6_WaveSize.html) in DirectX, and [simdgroups ](https://developer.apple.com/documentation/metal/compute_passes/creating_threads_and_threadgroups#2928931) in Metal.
+
+
@@ -4447,11 +4338,6 @@ The dramatic increase in throughput at a lower precision isn't just about raw sp
For SmolLM3, we were going to train on NVIDIA H100 80GB HBM3 GPUs, so we first wanted to test the H100's theoretical TFLOPs specifications against real world performance. For this, we used the [SemiAnalysis GEMM benchmark](https://www.ray.so/#theme=prisma&darkMode=false&code=IyBBTUQgVklQIGltYWdlCmFsaWFzIGRydW49InN1ZG8gZG9ja2VyIHJ1biAtLXByaXZpbGVnZWQgLS1uZXR3b3JrPWhvc3QgLS1kZXZpY2U9L2Rldi9rZmQgLS1kZXZpY2U9L2Rldi9kcmkgLS1ncm91cC1hZGQgdmlkZW8gLS1jYXAtYWRkPVNZU19QVFJBQ0UgLS1zZWN1cml0eS1vcHQgc2VjY29tcD11bmNvbmZpbmVkIC0taXBjPWhvc3QgLS1zaG0tc2l6ZT0xOTI2IC0tcm0gLWl0IgpkcnVuIHNlbWlhbmFseXNpc3dvcmsvYW1kLW1hdG11bDpsYXRlc3QKRElTQUJMRV9BREROX0hJUF9MVD0wIFBZVE9SQ0hfVFVOQUJMRV9PUF9FTkFCTEVEPTEgcHl0aG9uIG1hdG11bC5weQoKI0FNRCBweXBpIG5pZ2h0bHkKZHJ1biBhbWQtbGF0ZXN0LXB5cGktbmlnaHRseS1tYXRtdWwKUFlUT1JDSF9UVU5BQkxFX09QX0VOQUJMRUQ9MSBweXRob24gbWF0bXVsLnB5CgojIEFNRCBweXBpIHN0YWJsZSBQeVRvcmNoIDIuNS4xCmRydW4gc2VtaWFuYWx5c2lzd29yay9hbWQtbGF0ZXN0LXB5cGktc3RhYmxlLW1hdG11bApQWVRPUkNIX1RVTkFCTEVfT1BfRU5BQkxFRD0xIHB5dGhvbiBtYXRtdWwucHkKCiMgTnZpZGlhIHN0YWJsZSAyNC4wOQphbGlhcyBkcnVuPSJkb2NrZXIgcnVuIC0tcm0gLWl0IC0tZ3B1cyBhbGwgLS1pcGM9aG9zdCAtLW5ldD1ob3N0IC0tc2htLXNpemU9MTkyNiIKZHJ1biBzZW1pYW5hbHlzaXN3b3JrL252aWRpYS1tYXRtdWw6bGF0ZXN0CnB5dGhvbiBtYXRtdWwucHkKCg&language=shell): it [tests throughput on real-world matrix multiplication shapes from Meta's Llama 70B training](https://newsletter.semianalysis.com/p/mi300x-vs-h100-vs-h200-benchmark-part-1-training#general-matrix-multiply-gemm-performance).
-```
-## Using docker image semianalysiswork/nvidia-matmul:latest (nvidia stable 24.09)
-$ python matmul.py
-```
-
@@ -4469,9 +4355,14 @@ $ python matmul.py
**Validating theoretical performance** : Our experiments revealed the gap between theoretical peaks and achievable performance.
-For **FP64** and **FP32** operations, we achieved 49-56 TFLOPs and 356-396 TFLOPs respectively, representing 98-112% and 71-79% of their theoretical peaks. While these show excellent hardware utilization, these precisions are rarely used in modern deep learning training due to their computational cost.
+For **FP64 Tensor Core** operations, we achieved 49-56 TFLOPs, representing 74-84% of the theoretical peak (67 TFLOPs). For **TF32** (TensorFloat-32, which PyTorch uses by default for FP32 tensors on Tensor Cores), we achieved 356-396 TFLOPs, representing 72-80% of the theoretical peak (~495 TFLOPs dense). While these show excellent hardware utilization, these precisions are rarely used in modern deep learning training: FP64 due to its computational cost, and TF32 because lower precisions like BF16 and FP8 offer better performance.
+
+
+
+[NVIDIA specs](https://www.nvidia.com/en-us/data-center/h100/) often list sparse performance (989 TFLOPs for TF32) which assumes 2:4 structured sparsity patterns. Dense operations, which our benchmark tests, achieve roughly half the sparse peak (~495 TFLOPs).
+
-For **BF16** operations, we consistently achieved 714-758 TFLOPs across different matrix shapes, approximately 72-77% of the H100's theoretical 990 TFLOPs peak. This is, in practice, an excellent utilization rate for a real-world workload!
+For **BF16** operations, we consistently achieved 714-758 TFLOPs across different matrix shapes, approximately 72-77% of the H100's theoretical 990 TFLOPs peak. This is, in practice, an excellent utilisation rate for a real-world workload!
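If you want a quick, rough sanity check of achieved BF16 throughput on your own GPU (this is not the SemiAnalysis harness, it requires a CUDA device, and the shape below is illustrative rather than one of the Llama 70B GEMMs), a simple timing loop gets you most of the way:

```python
import time
import torch

# Illustrative square GEMM; a matmul of (M,K) x (K,N) costs ~2*M*N*K FLOPs.
M, K, N = 8192, 8192, 8192
a = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
b = torch.randn(K, N, device="cuda", dtype=torch.bfloat16)

for _ in range(10):          # warm-up so clocks and caches settle
    a @ b
torch.cuda.synchronize()

iters = 50
start = time.perf_counter()
for _ in range(iters):
    a @ b
torch.cuda.synchronize()
elapsed = time.perf_counter() - start

flops = 2 * M * N * K * iters
print(f"Achieved throughput: {flops / elapsed / 1e12:.1f} TFLOPs")
```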
@@ -4482,15 +4373,13 @@ For **BF16** operations, we consistently achieved 714-758 TFLOPs across differ
State-of-the-art MFU in training: Meta achieved 38-41% when training Llama 3 405B, while DeepSeek-v3 reached ~20-30% on GPUs with tighter communication bottlenecks related to the MoE architecture. For SmolLM3, we achieved ~30% MFU as we'll see later. Much of the gap comes from inter-node communication overhead in distributed training. Given our kernel-level ceiling of ~77%, these end-to-end numbers represent roughly 50-55% efficiency relative to achievable matmul performance. Inference workloads can reach MFU above 70%, closer to raw matmul performance, though published results from production deployments are scarce.
-The FP8 results are more nuanced. Let's look at our results on 3 different matrix multiplication methods/kernels.
+The **FP8** results are more nuanced. Let's look at our results on 3 different matrix multiplication methods/kernels.
-A [kernel](https://modal.com/gpu-glossary/device-software/kernel) is the unit of CUDA code.
+A kernel is a function that runs on the GPU; it's the basic unit of CUDA code.
-
-
Using PyTorch's `torch._scaled_mm` kernel with e4m3 precision, we achieved 1,210-1,457 TFLOPs depending on the matrix shape, roughly 31-37% of the theoretical 3,960 TFLOPs peak. 😮 Why? This lower utilization percentage (in FP8) actually doesn't indicate poor performance; rather, it reflects that these operations become increasingly memory-bound as compute throughput grows. The [Tensor Cores](https://www.nvidia.com/en-us/data-center/tensor-cores/) can process FP8 data faster than the memory system can deliver it, making memory bandwidth the limiting factor.
The [Transformer Engine](https://github.com/NVIDIA/TransformerEngine)'s `TE.Linear` achieved 547-1,121 TFLOPs depending on the shape, while `torch._scaled_mm` consistently delivered higher throughput. This highlights an important lesson: ***kernel implementation matters*** *significantly, and the choice of API can impact performance by 2-3x even when targeting the same hardware capabilities.*
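To make the FP8 path concrete, here is a rough sketch of a single FP8 GEMM through `torch._scaled_mm`. It's a private PyTorch API whose signature has changed across releases, so treat the exact call below (keyword per-tensor scales, e4m3 inputs, BF16 output, recent PyTorch) as an assumption rather than the benchmark's actual code:

```python
import torch

# FP8 (e4m3) GEMM sketch via torch._scaled_mm. The second operand must be
# column-major, and the scales here are per-tensor float32 scalars.
M = K = N = 8192
a = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
b = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn).t()  # (K, N), column-major
scale_a = torch.ones(1, device="cuda")
scale_b = torch.ones(1, device="cuda")

def fp8_gemm():
    return torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)

for _ in range(10):      # warmup
    fp8_gemm()
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
iters = 50
for _ in range(iters):
    fp8_gemm()
end.record()
torch.cuda.synchronize()
seconds = start.elapsed_time(end) / 1e3
print(f"FP8 GEMM: {2 * M * K * N * iters / seconds / 1e12:.0f} TFLOPs")
```

Running the same shapes through `TE.Linear` is exactly the kind of apples-to-apples comparison that surfaces the 2-3x kernel gap described above.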
@@ -4516,7 +4405,7 @@ In order to make calculations, GPUs need to read/write to memory, so it's import
-TL;DR: GPUs organize memory in a hierarchy from fast-but-small (registers, shared memory) to slow-but-large (HBM main memory). Understanding this hierarchy is critical because modern AI is often memory-bound: the bottleneck is moving data, not computing on it. Operator fusion (like Flash Attention) achieves 2-4× speedups by keeping intermediate results in fast on-chip memory instead of writing to slow HBM. Benchmarks show H100's HBM3 delivers ~3 TB/s in practice, matching theoretical specs for large transfers.
+TL;DR: GPUs organize memory in a hierarchy from fast-but-small (registers, shared memory) to slow-but-large (HBM main memory). Understanding this hierarchy is critical because modern AI is often memory-bound: the bottleneck is moving data, not computing on it. Operator fusion (like Flash Attention) achieves 2-4× speedups by keeping intermediate results in fast on-chip memory instead of writing to slow HBM. Benchmarks show H100's HBM3 delivers ~3 TB/s in practice, matching theoretical specs for large transfers.
To visualize how memory operations flow through a GPU in practice, let's first look at the [Memory Chart from NVIDIA Nsight Compute](https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#memory-chart), a profiling graph which shows a graphical representation of how data moves between different memory units for any kernel of your choice:
@@ -4542,15 +4431,18 @@ It provides several key insights:
- **Memory access patterns** : The flow between logical and physical units shows whether your kernel has good spatial/temporal locality
- **Port utilization** : Individual memory ports may be saturated even when aggregate bandwidth appears underutilized
-In our specific case above, you can see how kernel instructions flow through the memory hierarchy (for FP64 matrix multiplications on our hardware): global load instructions generate requests to the L1/TEX cache, which may hit or miss and generate further requests to L2, which ultimately accesses device memory (HBM) on misses. The colored rectangles inside units show port utilization—even if individual links operate below peak, the shared data port may be saturated.
+In our specific case above, you can see how kernel instructions flow through the memory hierarchy (for FP64 matrix multiplications on our hardware): global load instructions generate requests to the L1/TEX cache, which may hit or miss and generate further requests to L2, which ultimately accesses device memory (HBM) on misses. The colored rectangles inside units show port utilization: even if individual links operate below peak, the shared data port may be saturated.
+
+
For optimal performance, aim to minimize traffic to slower memory tiers (HBM) while maximizing utilization of faster tiers (shared memory, registers).
+
-Now let's understand the underlying memory hierarchy that makes this chart possible.
+Now let's understand the underlying memory hierarchy that makes this chart possible. Modern GPUs organize memory in a hierarchy that balances speed, capacity, and cost, a design dictated by fundamental physics and circuit constraints.
-Modern GPUs organize memory in a hierarchy that balances speed, capacity, and cost, a design dictated by fundamental physics and circuit constraints.
+
@@ -4565,7 +4457,7 @@ Moving up the hierarchy toward the compute units, we find progressively faster b
This hierarchy exists because SRAM (used for caches and registers) is fast but physically large and expensive, while DRAM (used for HBM) is dense and cheap but slower. The result: fast memory comes in small quantities close to compute, backed by progressively larger pools of slower memory further away.
- **Why this matters** : Understanding this hierarchy is essential for kernel optimization. The key insight is that memory-bound operations are limited by how fast you can move data, not how fast you can compute. As [Horace He](https://upload.wikimedia.org/wikipedia/commons/b/b2/Hausziege_04.jpg) explains in [Making Deep Learning Go Brrrr From First Principles](https://horace.io/brrr_intro.html), "load from memory" → "multiply by itself twice" → "write to memory" takes essentially the same time as "load from memory" → "multiply by itself once" → "write to memory": the computation is "free" compared to the memory access.
+ **Why this matters** : Understanding this hierarchy is essential for kernel optimization. The key insight is that memory-bound operations are limited by how fast you can move data, not how fast you can compute. As [Horace He](https://upload.wikimedia.org/wikipedia/commons/b/b2/Hausziege_04.jpg) explains in [Making Deep Learning Go Brrrr From First Principles](https://horace.io/brrr_intro.html), *"load from memory" → "multiply by itself twice" → "write to memory"* takes essentially the same time as *"load from memory" → "multiply by itself once" → "write to memory"* : the computation is "free" compared to the memory access.
This is why **operator fusion** is so powerful: by combining multiple operations into a single kernel, you can keep intermediate results in fast SRAM instead of writing them back to slow HBM between operations. Flash Attention is a perfect example of this principle in action.
@@ -4587,7 +4479,7 @@ This is why **operator fusion** is so powerful: by combining multiple operatio
Only the final output is written back to HBM
-The result: Flash Attention reduces HBM accesses from O(N²) to O(N), transforming a memory-bound operation into one that better utilizes the GPU's compute capabilities. This is the essence of efficient kernel design: minimize slow memory movement, maximize fast computation.
+The result: Flash Attention reduces HBM accesses from O(N²) to O(N), transforming a memory-bound operation into one that better utilizes the GPU's compute capabilities. This is the essence of efficient kernel design: *minimize slow memory movement, maximize fast computation* .
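You don't need to write a kernel to see fusion pay off: comparing a naive attention implementation against PyTorch's fused scaled-dot-product attention (which dispatches to Flash Attention on H100s) already shows the difference. A rough sketch, with shapes, dtypes, and the PyTorch ≥2.3 `sdpa_kernel` context manager as our own assumptions:

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# Naive attention materializes the N x N score matrix in HBM (scores, then the
# softmax output); the fused path keeps those intermediates on-chip.
B, H, N, D = 2, 32, 4096, 128
q, k, v = (torch.randn(B, H, N, D, device="cuda", dtype=torch.bfloat16) for _ in range(3))

def naive_attention(q, k, v):
    scores = (q @ k.transpose(-2, -1)) / D**0.5   # (B, H, N, N) intermediate in HBM
    return torch.softmax(scores, dim=-1) @ v

def bench(fn, iters=20):
    for _ in range(3):        # warmup
        fn(q, k, v)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn(q, k, v)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters

print(f"naive attention: {bench(naive_attention):.2f} ms")
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    print(f"fused attention: {bench(F.scaled_dot_product_attention):.2f} ms")
```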
**Example: Validating our HBM3 Bandwidth in Practice**
@@ -4600,6 +4492,13 @@ You can install NVBandwidth from [NVIDIA's GitHub repository](https://github.com
Let's use it to measure our H100's local memory bandwidth using the `device_local_copy` test, which measures bandwidth of `cuMemcpyAsync` between device buffers local to the GPU across different message sizes.
+
+
+cuMemcpyAsync is a CUDA driver API function that asynchronously copies data between two memory pointers, inferring the type of transfer (host-to-host, host-to-device, device-to-device, or device-to-host).
+
+
+
+
```
$ ./nvbandwidth -t device_local_copy -b 2048
memcpy local GPU(column) bandwidth (GB/s)
@@ -4620,7 +4519,7 @@ memcpy local GPU(column) bandwidth (GB/s)
referenceLine: { value: 1975, label: "Max" },
xFormatAsFileSize: true
}}
- title="Device local copy bandwidth vs message size"
+ title="Measured H100 Local Memory Bandwidth"
/>
@@ -4642,7 +4541,9 @@ The *roofline model* provides a visual framework for understanding these perfo
Let's apply it to a real kernel analysis. It's available in the Nsight Compute profiling tool we mentioned before (under "roofline analysis view"). Here's what we get:
-
+
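As a back-of-the-envelope complement to the profiler view, you can place a kernel on the roofline yourself from its arithmetic intensity. The sketch below reuses the H100 BF16 peak and HBM bandwidth figures from earlier sections; the GEMM shapes are arbitrary examples:

```python
# Rough roofline arithmetic: peak compute / memory bandwidth gives the ridge
# point, and a kernel's FLOPs-per-byte tells you which side of it you land on.
peak_bf16_flops = 990e12   # H100 dense BF16 Tensor Core peak
hbm_bandwidth = 3.35e12    # HBM3, bytes/s (we measured ~3 TB/s in practice)

ridge = peak_bf16_flops / hbm_bandwidth   # FLOPs per byte
print(f"ridge point ≈ {ridge:.0f} FLOPs/byte")

def gemm_intensity(m, k, n, bytes_per_el=2):   # BF16 = 2 bytes/element
    flops = 2 * m * k * n
    traffic = bytes_per_el * (m * k + k * n + m * n)  # ideal: read A, B and write C once
    return flops / traffic

for s in (512, 2048, 8192):
    ai = gemm_intensity(s, s, s)
    print(f"{s}^3 GEMM: {ai:.0f} FLOPs/byte -> {'compute-bound' if ai > ridge else 'memory-bound'}")
```

Small GEMMs (and virtually all elementwise ops) sit left of the ridge point, which is exactly why fusion matters so much.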
@@ -4684,8 +4585,8 @@ This is where the external communication infrastructure becomes crucial. No matt
In this section, we'll look at four critical communication links that connect a GPU to the outside world:
- **GPU-CPU** : How the CPU schedules work and transfers data to GPUs
-- **GPU-GPU (same node)** : How GPUs on the same machine communicate
-- **GPU-GPU (different nodes)** : How GPUs across different machines communicate over the network
+- **GPU-GPU intra-node** : How GPUs on the same machine communicate
+- **GPU-GPU inter-node** : How GPUs across different machines communicate over the network
- **GPU-Storage** : How data flows from storage to GPU memory
Each of these links has different bandwidth and latency characteristics, and understanding them will help you identify where your training pipeline might be bottlenecked. To make this easier to understand, we've created a simplified diagram that highlights the most important components and communication links:
@@ -4744,16 +4645,16 @@ From the `lstopo` output, we can see two key PCIe bandwidth values in our syst
To have a better understanding of the whole topology, we can visualize it using:
```
-lstopo --whole-system lstopo-diagram.png
+$ lstopo --whole-system lstopo-diagram.png
```
-
+
This diagram showcases the hierarchical structure of our system:
-- It contains two NUMA (Non-Uniform Memory Access) nodes (NUMA is memory zone per CPU socket)
-- Each CPU socket connects to four PCIe switches via PCIe Gen4 x8 links (15.75GB/s)
-- Each PCIe switch connects to one H100 GPU via PCIe Gen5 x16 links (63.02GB/s)
+- It contains two **NUMA** (Non-Uniform Memory Access) nodes (a NUMA node is the memory zone attached to one CPU socket)
+- Each **CPU socket** connects to four **PCIe switches** via **PCIe Gen4** x8 links (15.75GB/s)
+- Each **PCIe switch** connects to one **H100 GPU** via **PCIe Gen5** x16 links (63.02GB/s)
- … (We'll explore the other components like NVSwitch, EFA network cards and NVMe drives in the next sections.)
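The 15.75 GB/s and 63.02 GB/s figures above follow directly from the PCIe signaling rates; here's the quick per-direction arithmetic (Gen1/2 use 8b/10b encoding, Gen3 and later use 128b/130b):

```python
# Usable PCIe bandwidth per direction: transfer rate x lanes x encoding / 8 bits.
def pcie_gb_per_s(gt_per_s, lanes, encoding=128 / 130):
    return gt_per_s * lanes * encoding / 8

print(f"Gen4 x8 : {pcie_gb_per_s(16, 8):.2f} GB/s")   # ~15.75 GB/s (CPU -> PCIe switch)
print(f"Gen5 x16: {pcie_gb_per_s(32, 16):.2f} GB/s")  # ~63.02 GB/s (PCIe switch -> GPU)
```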
The PCIe specification differs between generations, each doubling the transfer rate per lane. Note that Transfer Rate is measured in GT/s (GigaTransfers per second), which represents the raw signaling rate, while Throughput is measured in GB/s (Gigabytes per second), which accounts for encoding overhead and represents the actual usable bandwidth:
@@ -4778,7 +4679,7 @@ config={{ initialFilter: 'cpu-gpu' }}
caption="CPU-to-GPU communication path."
/>
-From the topology diagram and the PCIe bandwidth table, we can see that the CPU-to-GPU path goes through two PCIe hops: first from the CPU to the PCIe switch via PCIe Gen4 x8 (15.754 GB/s), then from the PCIe switch to the GPU via PCIe Gen5 x16 (63.015 GB/s). *This means the bottleneck for CPU-GPU communication is the first hop at 15.754 GB/s* . Let's validate this with another util, `nvbandwidth` !
+From the topology diagram and the PCIe bandwidth table, we can see that the CPU-to-GPU path goes through two PCIe hops: first from the CPU to the PCIe switch via **PCIe Gen4** x8 (15.754 GB/s), then from the PCIe switch to the GPU via **PCIe Gen5** x16 (63.015 GB/s). *This means the bottleneck for CPU-GPU communication is the first hop at 15.754 GB/s* . Let's validate this with another util, `nvbandwidth` !
The `host_to_device_memcpy_ce` command measures bandwidth of `cuMemcpyAsync` from host (CPU) memory to device (GPU) memory using the GPU's copy engines.
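If you don't have nvbandwidth at hand, a rough host-to-device check is a few lines of PyTorch. This sketch (buffer size and iteration count are arbitrary choices) should be bounded by the first PCIe hop on a topology like ours; note that only pinned host memory lets the copies run asynchronously through the copy engines:

```python
import time
import torch

# Host-to-device copy bandwidth with pinned memory (a rough stand-in for
# nvbandwidth's host_to_device_memcpy_ce test).
n_bytes = 1 << 30  # 1 GiB
host = torch.empty(n_bytes, dtype=torch.uint8, pin_memory=True)
device = torch.empty(n_bytes, dtype=torch.uint8, device="cuda")

for _ in range(3):   # warmup
    device.copy_(host, non_blocking=True)
torch.cuda.synchronize()

iters = 20
t0 = time.perf_counter()
for _ in range(iters):
    device.copy_(host, non_blocking=True)
torch.cuda.synchronize()
elapsed = (time.perf_counter() - t0) / iters
print(f"H2D bandwidth: {n_bytes / elapsed / 1e9:.1f} GB/s")
```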
@@ -4833,7 +4734,7 @@ caption="CPU-to-GPU latency measured with nvbandwidth's host_device_latency_sm t
/>
-The results show that latency is approximately **1.4 microseconds** . This explains the kernel launch overhead of a few microseconds we often observe in ML workloads. For workloads launching many small kernels, the added latency can become a bottleneck; otherwise, overhead is hidden by overlapping execution.
+The results show that **latency** is approximately **1.4 microseconds** . This explains the kernel launch overhead of a few microseconds we often observe in ML workloads. For workloads launching many small kernels, the added latency can become a bottleneck; otherwise, overhead is hidden by overlapping execution.
@@ -4863,7 +4764,7 @@ For more details, see the
-For comprehensive benchmarking scripts and configurations, see the excellent collection at [AWS Distributed Training Samples](https://github.com/aws-samples/awsome-distributed-training/tree/main/micro-benchmarks/nccl-tests). For a quick refresher on collective communication patterns, see the [UltraScale Playbook Appendix](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=ring_allreduce).
+For a quick refresher on collective communication patterns, see the [UltraScale Playbook Appendix](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=ring_allreduce).
```
@@ -5040,9 +4965,14 @@ $ ./all_reduce_perf -b 8 -e 16G -f 2 -g 1 -c 1 -n 100
+
+
+For comprehensive benchmarking scripts and configurations, see the excellent collection at [AWS Distributed Training Samples](https://github.com/aws-samples/awsome-distributed-training/tree/main/micro-benchmarks/nccl-tests).
+
+
But wait… We are achieving 480 GB/s, which exceeds the theoretical unidirectional bandwidth of 450 GB/s for NVLink 4.0 😮 What is this sorcery, and how is this possible?
-Diving a bit in the docs, it seems like the answer lies in NVLink SHARP (NVLS), NVIDIA's hardware-accelerated collective operations technology. They provide approximately 1.3x speedup for allreduce operations on a single node with H100 GPUs!
+Diving a bit into the docs, it seems like the answer lies in **NVLink SHARP (NVLS)** , NVIDIA's hardware-accelerated collective operations technology. It provides approximately a 1.3x speedup for allreduce operations on a single node with H100 GPUs!
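You can reproduce a first-order version of this measurement without nccl-tests, straight from `torch.distributed`. The sketch below follows the nccl-tests bus-bandwidth convention for allreduce; the payload size and launch command are our own choices:

```python
# Launch with e.g.: torchrun --nproc_per_node=8 allreduce_bench.py
import time
import torch
import torch.distributed as dist

dist.init_process_group("nccl")
rank, world = dist.get_rank(), dist.get_world_size()
torch.cuda.set_device(rank % torch.cuda.device_count())

x = torch.empty(256 * 1024 * 1024, dtype=torch.bfloat16, device="cuda")  # 512 MiB payload
n_bytes = x.numel() * x.element_size()

for _ in range(5):   # warmup
    dist.all_reduce(x)
torch.cuda.synchronize()

iters = 20
t0 = time.perf_counter()
for _ in range(iters):
    dist.all_reduce(x)
torch.cuda.synchronize()
elapsed = (time.perf_counter() - t0) / iters

algbw = n_bytes / elapsed / 1e9
busbw = algbw * 2 * (world - 1) / world   # allreduce bus bandwidth, as in nccl-tests
if rank == 0:
    print(f"algbw {algbw:.1f} GB/s | busbw {busbw:.1f} GB/s")
dist.destroy_process_group()
```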
@@ -5060,6 +4990,11 @@ $ ./all_to_all_perf -b 8 -e 16G -f 2 -g 1 -c 1 -n 100
We achieve **340 GB/s** for alltoall operations, which aligns with published benchmarks showing similar performance characteristics for H100 systems with NVLink 4.0 ([source](https://juser.fz-juelich.de/record/1019178/files/02-NCCL_NVSHMEM.pdf#page=20.00)). Unlike allreduce, alltoall operations don't benefit from NVLS hardware acceleration, which explains why we see 340 GB/s here compared to the 480 GB/s achieved with allreduce. The alltoall pattern requires more complex point-to-point data exchanges between all GPU pairs, relying purely on NVLink's base bandwidth rather than NVSwitch's collective acceleration features.
+
+
+For custom NVLink communication patterns, keep an eye on PyTorch's [SymmetricMemory API](https://dev-discuss.pytorch.org/t/pytorch-symmetricmemory-harnessing-nvlink-programmability-with-ease/2798), which enables fine-grained control over NVLink and NVLS operations.
+
+
Some optimized kernels separate NVLink communication from compute by assigning dedicated warps to handle transfers. For example, ThunderKittens uses a warp-level design where specific warps issue NVLink transfers and wait for completion, while other warps continue compute operations. This fine-grained overlap of SM compute and NVLink communication can hide most inter-GPU communication latency. For implementation details, see the ThunderKittens blog post on multi-GPU kernels.
@@ -5102,8 +5037,8 @@ For AWS p5 instances we have [Elastic Fabric Adapter (](https://docs.aws.amazon.
-As illustrated above, when GPUs and network cards are connected to the same PCIe switch, GPUDirect RDMA enables their communication to occur solely through that switch. This setup allows for full utilization of the PCIe Gen5 x16 bandwidth and avoids involving other PCIe switches or the CPU memory bus.
-Theoretically, 8 PCIe Switches per node x 4 EFA NICs per switch x 100 Gbps each EFA NIC gives 3200 Gbps (400GB/s) of bandwidth which is the bandwidth we find in [AWS p5's specs).](https://aws.amazon.com/ec2/instance-types/p5/) So how does it hold in practice? Let's find out by running the same benchmarks as before but across different nodes!
+As illustrated above, when GPUs and network cards are connected to the same PCIe switch, **GPUDirect RDMA** enables their communication to occur solely through that switch. This setup allows for full utilization of the PCIe Gen5 x16 bandwidth and avoids involving other PCIe switches or the CPU memory bus.
+Theoretically, 8 PCIe switches per node x 4 EFA NICs per switch x 100 Gbps per EFA NIC gives **3200 Gbps** (400 GB/s) of bandwidth, which is the bandwidth we find in [AWS p5's specs](https://aws.amazon.com/ec2/instance-types/p5/). So how does this hold up in practice? Let's find out by running the same benchmarks as before but across different nodes!
**Bandwidth Analysis**
@@ -5118,7 +5053,7 @@ aws-samples/awsome-distributed-training."/>
Point-to-point send/receive operations achieve around **42-43 GB/s** for 2-4 nodes but drop to approximately 21 GB/s for 5+ nodes. This performance degradation occurs because NCCL automatically reduces the number of point-to-point channels per peer from 2 to 1 when scaling beyond 4 nodes, effectively halving the available bandwidth utilization, while theoretical maximum remains ~50 GB/s (4 EFA NICs × 12.5 GB/s each). We successfully managed to restore the full throughput for this test on 5+ nodes by setting `NCCL_NCHANNELS_PER_NET_PEER=2` , though this flag should be used with caution as it may degrade all-to-all performance for example (see [GitHub issue #1272](https://github.com/NVIDIA/nccl/issues/1272) for details).
-The all-reduce operation demonstrates excellent scaling within a single node, achieving **480 GB/s** of bus bandwidth. When scaling to 2 nodes, bandwidth remains nearly identical at 479 GB/s, after which it stabilizes at around 320-350 GB/s for 3-16 nodes. This pattern reveals an important characteristic: while there's an initial drop when crossing node boundaries due to the transition from NVLink to the inter-node network fabric, the bandwidth then scales almost constantly as we add more nodes.
+The all-reduce operation demonstrates excellent performance within a single node, achieving **480 GB/s** of bus bandwidth. When scaling to 2 nodes, bandwidth remains nearly identical at 479 GB/s, after which it stabilizes at around 320-350 GB/s for 3-16 nodes. This pattern reveals an important characteristic: while there's an initial drop when crossing node boundaries due to the transition from NVLink to the inter-node network fabric, *the bandwidth then scales almost constantly as we add more nodes.*
@@ -5143,11 +5078,11 @@ The all-to-all operation shows more dramatic scaling challenges: starting at 344
-Latency measurements reveal the fundamental cost of crossing node boundaries. Send/receive operations maintain relatively stable latencies of 40-53 μs across all multi-node configurations, demonstrating that point-to-point communication latency is primarily determined by the base network round-trip time rather than cluster size, though some variation suggests network topology and routing effects still play a role.
+ **Latency** measurements reveal the fundamental cost of crossing node boundaries. Send/receive operations maintain relatively stable latencies of **40-53 μs** across all multi-node configurations, demonstrating that point-to-point communication latency is primarily determined by the base network round-trip time rather than cluster size, though some variation suggests network topology and routing effects still play a role.
-All-reduce operations show minimal latency of 12.9 μs within a single node, but this jumps to 55.5 μs for 2 nodes and continues increasing nearly linearly with cluster size, reaching 235 μs at 16 nodes. This progression reflects both the increased communication distance and the growing complexity of the reduction tree across more nodes.
+All-reduce operations show minimal latency of **12.9 μs** within a single node, but this jumps to **55.5 μs** for 2 nodes and continues increasing nearly linearly with cluster size, reaching **235 μs** at 16 nodes. This progression reflects both the increased communication distance and the growing complexity of the reduction tree across more nodes.
-All-to-all operations exhibit similar trends, starting at 7.6 μs for single-node communication but climbing to 60 μs at 2 nodes and reaching 621 μs at 16 nodes. The superlinear growth in latency for all-to-all operations indicates that network congestion and coordination overhead compound as more nodes participate in the collective.
+All-to-all operations exhibit similar trends, starting at **7.6 μs** for single-node communication but climbing to **60 μs** at 2 nodes and reaching **621 μs** at 16 nodes. The superlinear growth in latency for all-to-all operations indicates that network congestion and coordination overhead compound as more nodes participate in the collective.
@@ -5186,11 +5121,11 @@ Improper CPU affinity settings can significantly impact NCCL performance by caus
Understanding your network topology is crucial for diagnosing performance issues. Cloud placement groups, while helpful, don't guarantee minimal network hops between instances. In modern datacenter fat-tree topologies, instances placed under different top-level switches will experience higher latency and potentially lower bandwidth due to additional network hops in the routing path.
-For AWS EC2 users, the [Instance Topology API](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/how-ec2-instance-topology-works.html) provides valuable visibility into network node placement. Instances sharing t"he same network node at the bottom layer (directly connected to the instance) are physically closest and will achieve the lowest latency communication.
+For **AWS EC2** users, the [Instance Topology API](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/how-ec2-instance-topology-works.html) provides valuable visibility into network node placement. Instances sharing the same network node at the bottom layer (directly connected to the instance) are physically closest and will achieve the lowest latency communication.
-
+
```mermaid
graph TD
@@ -5290,7 +5225,7 @@ When using containers (Docker/Enroot), several configuration steps are critical
-We're gathering troubleshooting findings here as a community effort. If you've encountered performance issues or discovered effective debugging methods, please jump to the Discussion Tab and share your experience to help others optimize their interconnect utilization.
+We're gathering troubleshooting findings here as a community effort. If you've encountered performance issues or discovered effective debugging methods, please jump to the [Discussion Tab](https://huggingface.co/spaces/HuggingFaceTB/smol-training-playbook/discussions) and share your experience to help others optimize their interconnect utilization.
Now that you know how to debug bottlenecks in GPU-CPU and GPU-GPU communication, let's have a look at a part of GPU communication that typically gets less attention, namely communication with the storage layer!
@@ -5367,12 +5302,19 @@ $ lsblk --fs -M
```
This output shows the block device hierarchy on the system. The key observations:
-- `nvme0n1p1` is the root filesystem mounted at `/` , using 35% of its ~190GB capacity
+- `nvme0n1p1` is the root [Amazon EBS](https://aws.amazon.com/ebs/) filesystem mounted at `/` , using 35% of its full ~300GB capacity
- Eight NVMe drives ( `nvme1n1` through `nvme8n1` ) are configured as a RAID array named `MY_RAID`
- The RAID array is exposed as `/dev/md0` , formatted with XFS, and mounted at `/scratch` with 28TB available (8x3.5TB)
The arrows (┈▶) indicate that multiple NVMe devices are members of the same RAID array, which then combines into the single `md0` device.
+
+
+[Amazon Elastic Block Store (EBS)](https://aws.amazon.com/ebs/) is a high-performance, scalable block storage service designed for use with Amazon EC2 instances.
+
+
+
+
**Network Storage**
In addition to local NVMe storage, the system has access to network-attached storage systems:
@@ -5387,10 +5329,15 @@ weka-hopper.hpc.internal.huggingface.tech/default 393T 263T 131T 67% /fsx
```
This output shows:
-- `/dev/root` (291GB) is the root filesystem at 35% capacity
+- `/dev/root` (291GB [Amazon EBS](https://aws.amazon.com/ebs/)) is the root filesystem at 35% capacity
- `/fsx` (393TB WekaFS) is 67% full with 131TB available
-- `/admin` (4.5TB Lustre) is 63% full with 1.7TB available
-- `/dev/md0` (28TB local RAID) is only 1% full with 28TB available at `/scratch` . This is our 8×3.5TB NVMe instance store drives in RAID.
+- `/admin` (4.5TB FSx Lustre) is 63% full with 1.7TB available
+- `/dev/md0` (28TB local NVMe RAID) is only 1% full with 28TB available at `/scratch` . This is our 8×3.5TB SSD NVMe instance store drives in RAID.
+
+
+
+Note that `/fsx` isn't actually Amazon FSx, but WekaFS. We kept the same mount point name for convenience when we migrated from FSx to WekaFS.
+
The local NVMe RAID array ( `/scratch` ) provides the fastest I/O performance, while the network filesystems offer larger capacity for shared data storage.
@@ -5412,7 +5359,7 @@ The local NVMe RAID array ( `/scratch` ) provides the fastest I/O performance, w
nodes.
- Lustre: A parallel file system designed for HPC that separates
+ FSx Lustre: A parallel file system designed for HPC that separates
metadata and data services across different servers to enable parallel
access. While effective for large files, it can struggle with
metadata-intensive AI/ML workloads involving many small files.
@@ -5461,24 +5408,21 @@ The benchmark evaluates storage system performance across Throughput, Latency, I
-
-
-
The benchmarks reveal dramatic performance differences across our four storage systems:
- **Scratch (Local NVMe RAID)** dominates with **26.59 GiB/s** throughput and **337K IOPS** , making it 6.3× faster than FSx for throughput and 6.6× better for IOPS. This local RAID array of 8×3.5TB NVMe drives delivers the lowest latency (190μs at peak IOPS) and scales exceptionally well with thread count, achieving peak performance at 64 threads with 1M I/O sizes for throughput.
+ **/scratch (Local NVMe RAID)** dominates with **26.59 GiB/s** throughput and **337K IOPS** , making it 6.3× faster than FSx for throughput and 6.6× better for IOPS. This local RAID array of 8×3.5TB NVMe drives delivers the lowest latency (190μs at peak IOPS) and scales exceptionally well with thread count, achieving peak performance at 64 threads with 1M I/O sizes for throughput.
- **FSx (WekaFS)** provides solid network storage performance at **4.21 GiB/s** and **51K IOPS** , making it the best choice for shared data that needs reasonable performance. FSx achieves its best throughput (4.21 GiB/s) using CPUONLY transfer, while its best IOPS (51K) uses GPUD transfer type.
+ **/fsx (WekaFS)** provides solid network storage performance at **4.21 GiB/s** and **51K IOPS** , making it the best choice for shared data that needs reasonable performance. FSx achieves its best throughput (4.21 GiB/s) using CPUONLY transfer, while its best IOPS (51K) uses GPUD transfer type.
- **Admin (Lustre)** and **Root** filesystems show similar modest performance around **1.1 GiB/s** throughput, but differ significantly in IOPS capability. Admin achieves its peak throughput (1.13 GiB/s) with GPUD transfer and peaks at 17K IOPS with CPU_GPU transfer (24× better than Root), making it more suitable for workloads with many small operations. Root's poor IOPS performance (730) confirms it's best suited for large sequential operations only.
+ **/admin (FSx Lustre)** and **/root (EBS)** filesystems show similar modest performance around **1.1 GiB/s** throughput, but differ significantly in IOPS capability. Admin achieves its peak throughput (1.13 GiB/s) with GPUD transfer and peaks at 17K IOPS with CPU_GPU transfer (24× better than Root), making it more suitable for workloads with many small operations. Root's poor IOPS performance (730) confirms it's best suited for large sequential operations only.
-**Note on GPU_DIRECT Results:** GPUDirect Storage (GDS) is not currently enabled in our cluster, which explains why GPUD results for NVMe storage (Scratch and Root) underperform compared to CPUONLY transfers. With GDS properly configured, we would expect GPUD to show significant advantages for direct GPU-to-storage transfers, particularly for the high-performance NVMe arrays. The FSx and Admin results show better GPUD performance because WekaFS and Lustre have different optimization paths that don't rely as heavily on the GDS kernel module.
+Note on GPU_DIRECT Results: GPUDirect Storage (GDS) is not currently enabled in our cluster, which explains why GPUD results for NVMe storage (Scratch and Root) underperform compared to CPUONLY transfers. With GDS properly configured, we would expect GPUD to show significant advantages for direct GPU-to-storage transfers, particularly for the high-performance NVMe arrays.
**Optimal Configuration Patterns** : Across all storage types, maximum throughput occurs at 1M I/O sizes, while maximum IOPS occurs at the smallest tested size (64K). This classic tradeoff means choosing between raw bandwidth (large I/O) and operation concurrency (small I/O) based on workload characteristics. For ML training with large checkpoint files, the 1M-8M range on Scratch provides optimal performance.
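For a quick sanity check outside the full benchmark, even a naive sequential-read probe shows the gap between mount points. The paths below are placeholders for large files on each filesystem, and you need files well beyond the page cache (or dropped caches) for honest numbers:

```python
import os
import time

# Naive sequential-read throughput per mount point (no O_DIRECT, so the page
# cache will flatter small or recently written files).
def read_throughput_gb_s(path, block=8 * 1024 * 1024):
    size = os.path.getsize(path)
    t0 = time.perf_counter()
    with open(path, "rb", buffering=0) as f:
        while f.read(block):
            pass
    return size / (time.perf_counter() - t0) / 1e9

for path in ("/scratch/ckpt.bin", "/fsx/ckpt.bin"):   # hypothetical test files
    print(path, f"{read_throughput_gb_s(path):.2f} GB/s")
```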
@@ -5600,7 +5544,7 @@ Thermal throttling doesn't just hurt the affected GPU; it cascades across your e
-The chart above shows AllReduce bandwidth degrading as we scale from 1 to 16 nodes. Notice the sharp drop after 14 nodes, from 350 GB/s to 100 GB/s while we expect the bandwidth to stay above 300GB/s as we've seen before. This wasn't a network issue: a single node with thermal throttling became the bottleneck, forcing all other nodes to wait during gradient synchronization. In distributed training, you're only as fast as your slowest node.
+The chart above shows AllReduce bandwidth degrading as we scale from 1 to 16 nodes. Notice the sharp drop after 14 nodes, from **350 GB/s** to **100 GB/s**, while we expect the bandwidth to stay above 300 GB/s as we've seen before. This wasn't a network issue: a single node with thermal throttling became the bottleneck, forcing all other nodes to wait during gradient synchronization. In distributed training, you're only as fast as your slowest node.
👉 **Key lesson:** Before committing to long training runs, stress-test your hardware using the tools mentioned earlier to identify thermal and power limitations. Monitor temperatures continuously using DCGM telemetry and plan for real-world thermal limits. It's also good practice to verify that GPU clocks are set to maximum performance. For a deeper dive into why GPUs can't sustain their advertised performance due to power constraints, see [this excellent analysis on power throttling](https://www.thonking.ai/p/strangely-matrix-multiplications).
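Short of full DCGM telemetry, a lightweight way to catch this during a run is to poll NVML for temperatures, clocks, and throttle reasons. Here's a sketch using the `pynvml` bindings (package `nvidia-ml-py`); the throttle-reason flags checked are a minimal selection:

```python
import pynvml

# Report temperature, SM clocks vs. max, and whether thermal/power capping is active.
pynvml.nvmlInit()
for i in range(pynvml.nvmlDeviceGetCount()):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
    sm_clock = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
    max_clock = pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_SM)
    reasons = pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle)
    capped = bool(reasons & (pynvml.nvmlClocksThrottleReasonSwThermalSlowdown
                             | pynvml.nvmlClocksThrottleReasonHwThermalSlowdown
                             | pynvml.nvmlClocksThrottleReasonSwPowerCap))
    print(f"GPU{i}: {temp}C | SM {sm_clock}/{max_clock} MHz | thermal/power capped: {capped}")
pynvml.nvmlShutdown()
```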
@@ -5723,25 +5667,18 @@ We follow the [ **Ultra-Scale Playbook** ](https://huggingface.co/spaces/nanotro
-For detailed explanations of different parallelism strategies (Data Parallelism, Tensor Parallelism, Pipeline Parallelism, ZeRO, etc.), we urge you once again to checkout the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook) *inserts Bernie meme*
+For detailed explanations of different parallelism strategies (Data Parallelism, Tensor Parallelism, Pipeline Parallelism, ZeRO, etc.), we urge you once again to check out the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook) \*inserts Bernie meme\*
#### Step 1: Fitting a training step in memory
-The first question is simple: does our SmolLM3 3B model even fit in a single H100's 80GB of memory? To answer this, we use nanotron's `predict_memory` tool, which estimates memory consumption for model parameters, optimizer states, gradients, and activations.
+The first question is simple: does our SmolLM3 3B model even fit in a single H100's 80GB of memory? To answer this, we use [nanotron's `predict_memory` tool](https://huggingface.co/spaces/nanotron/predict_memory), which estimates memory consumption for model parameters, optimizer states, gradients, and activations.
-
-
-
-
+
@@ -5753,7 +5690,7 @@ The results show we're pushing close to the 80GB limit. This means we need some
Now that we know the model fits in memory with some form of parallelism, we need to determine how to achieve our target global batch size (GBS) of approximately 2 million tokens. This constraint gives us our first equation:
$$
-GBS = DP × MBS × GRAD_ACC × SEQLEN ≈ 2M tokens
+\text{GBS} = \text{DP} \times \text{MBS} \times \text{GRAD\_ACC} \times \text{SEQLEN} \approx 2\text{M tokens}
$$
Where:
@@ -5766,7 +5703,7 @@ Where:
We also have a hardware constraint from our 384 H100s:
$$
-DP × TP × PP = 384 = 2^7 × 3
+\text{DP} \times \text{TP} \times \text{PP} = 384 = 2^7 \times 3
$$
Where:
@@ -5834,7 +5771,7 @@ Now go train something. And when your loss spikes mysteriously at 2am, remember:
#### **Acknowledgments**
-We thank [Guilherme](https://huggingface.co/guipenedo) and [Hugo](https://huggingface.co/hlarcher) for their valuable feedback, and [Abubakar](https://huggingface.co/abidlabs) for his help with Trackio features.
+We thank [Guilherme](https://huggingface.co/guipenedo), [Hugo](https://huggingface.co/hlarcher) and [Mario](https://huggingface.co/mariolr) for their valuable feedback, and [Abubakar](https://huggingface.co/abidlabs) for his help with Trackio features.
## References