【Megatron-LM】Environment Setup and Running the Training Examples

Environment Setup

  1. Clone the code:
git clone https://github.com/NVIDIA/Megatron-LM.git
  2. Check out the stable branch:
git checkout -b core_r0.14.0 origin/core_r0.14.0
  3. Pull the specified Docker image:
docker pull nvcr.io/nvidia/pytorch:25.04-py3

Minimal Example

Running the example

  1. Enter the Docker container, mounting the local Megatron-LM checkout into the container at /workspace/megatron-lm. Note that only GPU 0 and GPU 1 are exposed to the container, and 16 GB of shared memory is used. Replace /home/xxx/Megatron-LM with your own local path.
docker run --gpus '"device=0,1"' -it --rm --shm-size=16G   -v /home/xxx/Megatron-LM:/workspace/megatron-lm   -e PIP_CONSTRAINT=   nvcr.io/nvidia/pytorch:25.04-py3
  2. Enter the megatron-lm directory and run the minimal example:
cd megatron-lm/
PYTHONPATH=$PYTHONPATH:./megatron torchrun --nproc-per-node 2 examples/run_simple_mcore_train_loop.py
  3. Check the output: there will be a few warnings, but the run still completes normally.

Walking through the minimal example

Below is a walkthrough of the code of this minimal example, examples/run_simple_mcore_train_loop.py:

import os
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from functools import partial
from pathlib import Path

from megatron.core import parallel_state
from megatron.core import dist_checkpointing
from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
from megatron.core.datasets.utils import compile_helpers
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset
from megatron.training.tokenizer.tokenizer import _NullTokenizer

_SEQUENCE_LENGTH = 64

def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
    parallel_state.destroy_model_parallel()

    # Torch setup for distributed training
    rank = int(os.environ['LOCAL_RANK'])
    world_size = torch.cuda.device_count()
    torch.cuda.set_device(rank)
    torch.distributed.init_process_group(world_size=world_size, rank=rank)

    # Megatron core distributed training initialization
    parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size)

def model_provider():
    """Build the model."""

    transformer_config = TransformerConfig(
        num_layers=2,
        hidden_size=12,
        num_attention_heads=4,
        use_cpu_initialization=True,
        pipeline_dtype=torch.float32,
    )

    gpt_model = GPTModel(
        config=transformer_config,
        transformer_layer_spec=get_gpt_layer_local_spec(),
        vocab_size=100,
        max_sequence_length=_SEQUENCE_LENGTH,
    )

    return gpt_model

def get_train_data_iterator():
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
            compile_helpers()
        torch.distributed.barrier()
    else:
        compile_helpers()

    config = GPTDatasetConfig(
        random_seed=0,
        sequence_length=_SEQUENCE_LENGTH,
        reset_position_ids=False,
        reset_attention_mask=False,
        eod_mask_loss=False,
        tokenizer=_NullTokenizer(vocab_size=_SEQUENCE_LENGTH),
        mid_level_dataset_surplus=0.005,
    )

    datasets = BlendedMegatronDatasetBuilder(
        MockGPTDataset, [1000, None, None], lambda: True, config
    ).build()

    train_dataloader = DataLoader(datasets[0], batch_size=8, shuffle=True)

    train_iterator = iter(train_dataloader)

    return train_iterator

def forward_step_func(data_iterator, model):

    def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):

        losses = output_tensor.float()
        loss_mask = loss_mask.view(-1).float()
        loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
        # If you have data parallel reduce loss across data parallel groups.
        # If pipeline parallel, loss computation is done only in last stage.

        return loss, {'lm loss': loss}

    data = next(data_iterator)
    tokens = data['tokens'].to(device)
    attention_mask = data['attention_mask'].to(device)
    position_ids = data['position_ids'].to(device)
    labels = data['labels'].to(device)
    loss_mask = data['loss_mask'].to(device)

    output_tensor = model(tokens, position_ids, attention_mask,
                          labels=labels)

    return output_tensor, partial(loss_func, loss_mask)

def save_distributed_checkpoint(checkpoint_path, gpt_model):
    sharded_state_dict = gpt_model.sharded_state_dict(prefix='')
    dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)

def load_distributed_checkpoint(checkpoint_path, gpt_model):
    sharded_state_dict = gpt_model.sharded_state_dict(prefix='')
    checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
    gpt_model.load_state_dict(checkpoint)
    return gpt_model

if __name__ == "__main__":
    initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
    model_parallel_cuda_manual_seed(123)

    gpt_model = model_provider()
    device = torch.device("cuda")
    gpt_model.to(device)

    optim = Adam(gpt_model.parameters())

    train_iterator = get_train_data_iterator()

    forward_backward_func = get_forward_backward_func()

    # Running the model for 5 iterations
    for _ in range(5):
        optim.zero_grad()

        losses_reduced = forward_backward_func(
            forward_step_func=forward_step_func,
            data_iterator=train_iterator,
            model=gpt_model,
            num_microbatches=1,
            seq_length=_SEQUENCE_LENGTH,
            micro_batch_size=8,
            decoder_seq_length=_SEQUENCE_LENGTH,
            forward_only=False)

        optim.step()

        print(f'Losses reduced : {losses_reduced}')

    # Saving the model
    ckpt_path = os.getcwd() + '/ckpt'
    Path(ckpt_path).mkdir(exist_ok=True)
    save_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)

    # Loading the model
    gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
    gpt_model.to(device)
    print('Successfully loaded the model')

Initialization

  • Destroy any previous model-parallel state

  • Get the local rank and the world_size (number of GPUs)

  • Set the CUDA device used by the current process

  • Initialize the PyTorch distributed process group

  • Initialize Megatron-Core model parallelism with tensor parallel size = 2 and pipeline parallel size = 1. This is the slightly more involved part; its main jobs are:

    • Derive the key parallelism parameters from the given arguments and the defaults, and validate them, e.g. with a check like the following:

      model_size = tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size

      if world_size % model_size != 0:
          raise RuntimeError(f"world_size ({world_size}) is not divisible by {model_size}")
    • It also creates the various DP/TP/PP/EP/CP communication groups via torch.distributed.new_group and passes them around through module-level globals (why so many globals...). These can be queried afterwards, as sketched below.
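
A minimal sketch (not from the example) of querying the resulting layout through the accessor functions in parallel_state; it has to run under torchrun after the initialization above:

from megatron.core import parallel_state

def report_parallel_layout():
    # These accessors read the module-level globals set by initialize_model_parallel().
    tp_rank = parallel_state.get_tensor_model_parallel_rank()
    tp_size = parallel_state.get_tensor_model_parallel_world_size()
    pp_rank = parallel_state.get_pipeline_model_parallel_rank()
    pp_size = parallel_state.get_pipeline_model_parallel_world_size()
    dp_size = parallel_state.get_data_parallel_world_size()
    print(f"tp {tp_rank}/{tp_size}, pp {pp_rank}/{pp_size}, dp size {dp_size}")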

Model construction

  • Build a TransformerConfig with 2 layers, hidden size 12, and 4 attention heads

  • Then build the GPT model from this TransformerConfig plus a few other arguments (vocabulary size 100, maximum sequence length 64); the result is a tiny toy model, as the sketch below makes concrete
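
A one-liner (plain PyTorch, not Megatron-specific) to see how small the model is:

def count_parameters(model):
    # With tensor parallelism each rank only holds its own shard,
    # so this counts per-rank parameters, not the full model.
    return sum(p.numel() for p in model.parameters())

# Usage inside the example, after model_provider():
#   print(f"parameters on this rank: {count_parameters(gpt_model)}")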

Optimizer construction

  • Initialize a plain PyTorch Adam optimizer from gpt_model.parameters()

Data preparation

  • Create the dataset config:

    • Use _NullTokenizer (a dummy tokenizer)

    • Sequence length 64

    • Do not reset position_ids or attention_mask

  • Build the blended dataset:

    • Use MockGPTDataset (a mock dataset for testing)

    • 1000 samples

  • Create a plain PyTorch DataLoader and wrap it in an iterator (one of its batches is inspected in the sketch after this list):

    • batch_size=8

    • shuffle=True
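
A quick way to see what these batches contain is to peek at one of them. This is an illustrative helper, not part of the example, and it consumes one batch from the iterator:

def peek_batch(data_iterator):
    """Print the fields of one batch (inspection only)."""
    batch = next(data_iterator)
    for key, value in batch.items():
        print(key, tuple(value.shape), value.dtype)

# Usage inside the example, right after get_train_data_iterator():
#   peek_batch(train_iterator)
# Expected keys (the ones forward_step_func below uses): tokens, labels,
# loss_mask, attention_mask, position_ids.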

Training loop

  • Get forward_backward_func

    • It drives forward_step_func, which pulls a batch from data_iterator, runs the model to produce output_tensor, and finally returns output_tensor together with partial(loss_func, loss_mask)

  • Train for 5 iterations

Model saving and loading

  • Call gpt_model.sharded_state_dict() to get the sharded state dict

  • Save it with dist_checkpointing.save()

    • Each GPU saves only its own shard of the model

    • Automatic sharding works for both tensor parallelism and pipeline parallelism; the listing sketch below shows what ends up on disk
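
A small sketch for listing the checkpoint directory after save_distributed_checkpoint() has run (the exact file names depend on the dist_checkpointing backend and version, so treat the output as informational):

import os

ckpt_path = os.path.join(os.getcwd(), "ckpt")   # same path as in the example
for name in sorted(os.listdir(ckpt_path)):
    print(name)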

GPT Training Example

Preparing the tokenizer

  • We use the GPT-2 tokenizer directly; download it as follows:
mkdir -p data/tokenizer
wget https://huggingface.co/gpt2/resolve/main/vocab.json -O data/tokenizer/gpt2-vocab.json
wget https://huggingface.co/gpt2/resolve/main/merges.txt -O data/tokenizer/gpt2-merges.txt

The vocab file maps each token to its integer id, while the merges file lists the BPE rules for merging two tokens into a larger one, as sketched below.
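
A quick peek at the two files makes this concrete. This is an illustrative sketch; the values noted in the comments are what I would expect for GPT-2, not something verified here:

import json

# Paths follow the data/tokenizer layout used above.
with open("data/tokenizer/gpt2-vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)                  # maps token string -> integer id
print(len(vocab))                         # vocabulary size (50257 for GPT-2)
print(vocab.get("hello"))                 # id of the token "hello", if present

with open("data/tokenizer/gpt2-merges.txt", encoding="utf-8") as f:
    merges = f.read().splitlines()[1:]    # first line is a version header
print(merges[0])                          # highest-priority merge rule, e.g. "Ġ t"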

Preparing the training data

  • Download the TinyStories training set:
wget -O data/TinyStoriesV2-GPT4-train.txt 'https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStoriesV2-GPT4-train.txt?download=true'
  • Since Megatron-LM expects datasets in jsonl format, the following script, txt_convert_to_jsonl.py, converts the file. Running it inside the container produces data/TinyStoriesV2-GPT4-train.jsonl.
import json
from tqdm import tqdm

in_file = "/workspace/megatron-lm/data/TinyStoriesV2-GPT4-train.txt"
out_file = "/workspace/megatron-lm/data/TinyStoriesV2-GPT4-train.jsonl"

# Count the total number of lines first (for the progress bar)
with open(in_file, "r", encoding="utf-8") as f:
    total_lines = sum(1 for _ in f)

with open(in_file, "r", encoding="utf-8") as fin, \
     open(out_file, "w", encoding="utf-8") as fout:

    for line in tqdm(fin, total=total_lines, desc="Converting"):
        line = line.strip()
        if line:
            fout.write(json.dumps({"text": line}, ensure_ascii=False) + "\n")

  • The jsonl then needs to be converted further, using Megatron-LM's preprocessing script and the tokenizer, into the bin + idx format. Run the command below; it produces data/TinyStoriesV2-GPT4-train_text_document.bin and data/TinyStoriesV2-GPT4-train_text_document.idx (a quick way to inspect them is sketched after the command).
python tools/preprocess_data.py \
    --input /workspace/megatron-lm/data/TinyStoriesV2-GPT4-train.jsonl \
    --output-prefix /workspace/megatron-lm/data/TinyStoriesV2-GPT4-train \
    --vocab-file /workspace/megatron-lm/data/tokenizer/gpt2-vocab.json \
    --merge-file /workspace/megatron-lm/data/tokenizer/gpt2-merges.txt \
    --tokenizer-type GPT2BPETokenizer \
    --append-eod \
    --workers 8
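
To sanity-check the output, the bin + idx pair can be read back with Megatron-core's indexed-dataset reader. This is a hedged sketch: it assumes the IndexedDataset class in megatron.core.datasets.indexed_dataset, and the exact class name and constructor may differ between Megatron-core versions:

from megatron.core.datasets.indexed_dataset import IndexedDataset  # assumed location/API

prefix = "/workspace/megatron-lm/data/TinyStoriesV2-GPT4-train_text_document"
ds = IndexedDataset(prefix)     # reads <prefix>.bin and <prefix>.idx
print(len(ds))                  # number of documents
print(ds[0][:20])               # first 20 token ids of the first document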

Training script for the 857M GPT-3 model

examples/gpt3 contains a training script for the 175B GPT model, train_gpt3_175b_distributed.sh. Unfortunately I only have four RTX 4090s, so I modified the script slightly to train the 857M configuration it describes instead. The resulting script, train_gpt3_857m_distributed.sh, is shown below:

#!/bin/bash

# Runs the "857m" parameter model

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=4
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH=$5 #<Specify path and file prefix>_text_document

DISTRIBUTED_ARGS=(
    --nproc_per_node $GPUS_PER_NODE
    --nnodes $NUM_NODES
    --master_addr $MASTER_ADDR
    --master_port $MASTER_PORT
)

GPT_MODEL_ARGS=(
    --num-layers 24
    --hidden-size 1024
    --num-attention-heads 16
    --seq-length 2048
    --max-position-embeddings 2048
    --attention-backend auto # Can use (flash/fused/unfused/local)
)

TRAINING_ARGS=(
    --micro-batch-size 1
    --global-batch-size 16
    # --rampup-batch-size 16 16 5859375
    --train-iters 10000
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --fp16
    --lr 6.0e-5
    --lr-decay-style cosine
    --min-lr 6.0e-6
    --lr-warmup-fraction .001
    --lr-decay-iters 430000
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 2
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --vocab-file $VOCAB_FILE
    --merge-file $MERGE_FILE
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 100
    --save-interval 10000
    --eval-interval 1000
    --save $CHECKPOINT_PATH
    --load $CHECKPOINT_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]}

The changes are:

  • The GPT model structure is set to:

    • num-layers 24

    • hidden-size 1024

    • num-attention-heads 16

  • To avoid running out of GPU memory, train-iters is reduced to 10000, global-batch-size to 16, and rampup-batch-size is dropped

  • The parallelism strategy is changed to TP size 2 and PP size 2 (see the arithmetic sketch below)

Note that these values were chosen just to get the run going; the hyperparameters have not been carefully tuned.
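
A quick back-of-the-envelope check of what these settings imply (plain Python, just arithmetic):

# With 4 GPUs, TP=2 and PP=2 leave a data-parallel size of 1, so each global
# batch of 16 is processed as 16 micro-batches of size 1 by that single DP replica.
gpus = 4
tp, pp = 2, 2
micro_batch_size, global_batch_size = 1, 16

dp = gpus // (tp * pp)                                            # = 1
num_microbatches = global_batch_size // (micro_batch_size * dp)   # = 16
print(dp, num_microbatches)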

Training the 857M GPT-3 model

  • First, enter the container; this time all 4 local GPUs are exposed to it:
docker run --gpus all -it --rm --shm-size=128G   -v /home/ljw/Megatron-LM:/workspace/megatron-lm   -e PIP_CONSTRAINT=   nvcr.io/nvidia/pytorch:25.04-py3
  • Create the directories needed for the TensorBoard logs and checkpoints:
mkdir -p /workspace/megatron-lm/model_ckpt/gpt3_857m
mkdir -p /workspace/megatron-lm/tb_logs/gpt3_857m
  • Then simply run the following command to start training:
bash examples/gpt3/train_gpt3_857m_distributed.sh \
/workspace/megatron-lm/model_ckpt/gpt3_857m \
/workspace/megatron-lm/tb_logs/gpt3_857m \
/workspace/megatron-lm/data/tokenizer/gpt2-vocab.json \
/workspace/megatron-lm/data/tokenizer/gpt2-merges.txt \
/workspace/megatron-lm/data/TinyStoriesV2-GPT4-train_text_document
  • A screenshot of the run is shown below:

  • GPU utilization as reported by nvtop is shown below:

