Merged
Update
From00 committed Jan 29, 2025
commit 925210d78d2b54512d0ac3fff4a12059b8534e00
9 changes: 8 additions & 1 deletion llm/auto_parallel/llama/run_pretrain_auto.py
@@ -58,8 +58,8 @@
check_data_split,
print_rank_0,
)
from paddlenlp.utils.tools import get_env_device
from paddlenlp.trainer.utils.doc import add_start_docstrings
from paddlenlp.utils.tools import get_env_device


@dataclass
@@ -174,6 +174,11 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)

use_fast_layer_norm: bool = field(
default=False,
metadata={"help": "Whether to use the fast fused layer norm kernel"},
)

config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
@@ -497,6 +502,8 @@ def main():

config = config_class.from_pretrained(model_args.model_name_or_path)

config.use_fast_layer_norm = model_args.use_fast_layer_norm

config.seq_length = data_args.max_seq_length
# Some techniques extend the RotaryEmbedding context, so don't change max_position_embeddings
if not model_args.continue_training:
5 changes: 5 additions & 0 deletions paddlenlp/trainer/auto_trainer.py
@@ -26,6 +26,7 @@
from tqdm.auto import tqdm

from paddlenlp.trainer import Trainer
from paddlenlp.utils.tools import get_env_device

from ..utils.batch_sampler import DistributedBatchSampler as NlpDistributedBatchSampler
from ..utils.log import logger
@@ -522,6 +523,10 @@

logger.info("\nTraining completed. \n")

# Hack for XPU that doesn't support Allgather yet. See LlamaPretrainingCriterion3DAuto in modeling_auto.py for details.
if get_env_device() == "xpu":
tr_loss = tr_loss.mean()

self._total_loss_scalar += self._get_item_from_loss(tr_loss)
train_loss = self._total_loss_scalar / self.state.global_step

46 changes: 38 additions & 8 deletions paddlenlp/transformers/llama/modeling_auto.py
@@ -54,6 +54,7 @@
from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model
from paddlenlp.utils.tools import get_env_device

from . import fusion_ops
from .configuration import (
LLAMA_PRETRAINED_INIT_CONFIGURATION,
LLAMA_PRETRAINED_RESOURCE_FILES_MAP,
@@ -70,7 +71,6 @@
build_alibi_tensor,
get_triangle_upper_mask,
repeat_kv,
rms_norm_fused,
)

try:
@@ -195,10 +195,6 @@
return (attn_output, attn_weights) if output_attentions else attn_output


colwise_placements = [dist.Replicate(), dist.Shard(1)]
rowise_placement = [dist.Replicate(), dist.Shard(0)]


class LlamaRMSNormAuto(nn.Layer):
def __init__(self, config, ipp):
super().__init__()
@@ -219,7 +215,9 @@

def forward(self, hidden_states):
if self.config.use_fused_rms_norm:
return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon)
return fusion_ops.fusion_rms_norm(

hidden_states, self.weight, self.variance_epsilon, self.config.use_fast_layer_norm
)

with paddle.amp.auto_cast(False):
variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True)
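Side note (an illustration, not part of the diff): the fused branch now calls fusion_ops.fusion_rms_norm(hidden_states, self.weight, self.variance_epsilon, self.config.use_fast_layer_norm), while the eager branch below computes RMSNorm directly. A minimal reference for that eager path, with shapes and epsilon chosen here rather than taken from the PR, can help sanity-check the fused kernel:

```python
import paddle

def rms_norm_reference(hidden_states, weight, eps=1e-6):
    # Mirrors the eager branch of LlamaRMSNormAuto.forward: normalize in fp32,
    # then scale by the learned weight and cast back to the input dtype.
    with paddle.amp.auto_cast(False):
        x = hidden_states.astype("float32")
        variance = x.pow(2).mean(-1, keepdim=True)
        x = paddle.rsqrt(variance + eps) * x
    return (x * weight.astype("float32")).astype(hidden_states.dtype)

x = paddle.randn([2, 8, 16], dtype="float32")
w = paddle.ones([16])
print(rms_norm_reference(x, w).shape)  # [2, 8, 16]
```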
@@ -239,6 +237,16 @@
self.fuse_attention_ffn = config.fuse_attention_ffn
self.ipp = ipp
self.config = config
colwise_placements = (

[dist.Replicate(), dist.Shard(1)]
if self.config.tensor_parallel_degree > 1
else [dist.Replicate(), dist.Replicate()]
)
rowise_placement = (

[dist.Replicate(), dist.Shard(0)]
if self.config.tensor_parallel_degree > 1
else [dist.Replicate(), dist.Replicate()]
)

if config.fuse_attention_ffn and not enable_fuse_ffn_qkv_pass():
self.gate_up_fused_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False)
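Note (an illustration, not part of the diff): the same tensor_parallel_degree > 1 guard is repeated in LlamaMLPAuto, LlamaAttentionAuto and LlamaLMHeadAuto. The selection logic boils down to the small helper below; with tensor_parallel_degree == 1, the pure data-parallel case this PR targets, every placement falls back to Replicate and no weight is sharded:

```python
import paddle.distributed as dist

def make_placements(tensor_parallel_degree: int):
    # Column-parallel weights shard the output dim (Shard(1)), row-parallel
    # weights shard the input dim (Shard(0)), but only when tensor parallelism
    # is actually enabled; otherwise everything stays replicated.
    colwise = (
        [dist.Replicate(), dist.Shard(1)]
        if tensor_parallel_degree > 1
        else [dist.Replicate(), dist.Replicate()]
    )
    rowwise = (
        [dist.Replicate(), dist.Shard(0)]
        if tensor_parallel_degree > 1
        else [dist.Replicate(), dist.Replicate()]
    )
    return colwise, rowwise

print(make_placements(1))  # pure DP: all Replicate
print(make_placements(2))  # TP enabled: Shard(1) / Shard(0)
```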
@@ -308,8 +316,19 @@
self.recompute_granularity = config.recompute_granularity
self.ipp = ipp

colwise_placements = (

[dist.Replicate(), dist.Shard(1)]
if self.config.tensor_parallel_degree > 1
else [dist.Replicate(), dist.Replicate()]
)
rowise_placement = (

[dist.Replicate(), dist.Shard(0)]
if self.config.tensor_parallel_degree > 1
else [dist.Replicate(), dist.Replicate()]
)

self.use_fused_rope = config.use_fused_rope
if self.use_fused_rope and get_env_device() not in ["npu", "mlu", "xpu", "gcu", "intel_hpu"]:

Contributor

In auto parallel, what is the relationship between modeling_network.py and modeling_auto.py? Does modeling_network.py need to be updated in sync?

Collaborator Author

modeling_network.py is the intermediate-level API implementation and modeling_auto.py is the base-API implementation; ideally the two should be merged into one. This PR is only the first step of the Kunlun (XPU) adaptation and currently supports only the dynamic-graph semi-auto, pure data-parallel scenario. Further testing and iteration are still needed; once that work is complete, a dedicated effort should merge modeling_network.py and modeling_auto.py.

if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None:
warnings.warn(
"Enable fuse rope in the config, but fuse rope is not available. "
@@ -936,22 +955,22 @@
else:
expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
# Convert bool attention_mask to float attention mask, which will be added to attention_scores later
if get_env_device() in ["npu", "mlu", "intel_hpu"]:
x = paddle.to_tensor(0.0, dtype="float32")
y = paddle.to_tensor(paddle.finfo(dtype).min, dtype="float32")
expanded_attn_mask = paddle.where(expanded_attn_mask.cast("bool"), x, y).astype(dtype)
elif get_env_device() == "xpu":
x = paddle.to_tensor(0.0, dtype="float32")
y = paddle.to_tensor(-1.7005809656952787e38, dtype="float32")
expanded_attn_mask = paddle.where(expanded_attn_mask.cast("bool"), x, y)
elif get_env_device() == "gcu":
min_val = paddle.finfo(dtype).min
x = paddle.to_tensor(0.0, dtype=dtype)
y = paddle.to_tensor(min_val, dtype=dtype)
expanded_attn_mask = paddle.where(expanded_attn_mask.cast("bool"), x, y).astype(dtype)

else:
expanded_attn_mask = paddle.where(expanded_attn_mask.cast("bool"), 0.0, paddle.finfo(dtype).min)
expanded_attn_mask = expanded_attn_mask.astype(dtype)

return expanded_attn_mask

def forward(
@@ -1182,15 +1201,26 @@
masked_lm_labels.unsqueeze(2),
)

masked_lm_loss = paddle.masked_select(masked_lm_loss, masked_lm_loss > 0).astype("float32")
loss = paddle.mean(masked_lm_loss)
# Hack for XPU that doesn't support Allgather yet.
if get_env_device() == "xpu":

# masked_lm_loss = paddle.masked_select(masked_lm_loss, masked_lm_loss > 0).astype("float32")
loss = paddle.mean(masked_lm_loss, axis=-1)

else:
masked_lm_loss = paddle.masked_select(masked_lm_loss, masked_lm_loss > 0).astype("float32")
loss = paddle.mean(masked_lm_loss, axis=-1)


return loss
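Note (an illustration, not part of the diff): the two branches above need not produce identical values. The default path averages only over positions selected by masked_lm_loss > 0, while the XPU path skips masked_select (because of the missing Allgather support noted in auto_trainer.py) and averages over the full sequence axis, leaving the final reduction to tr_loss.mean() in the trainer. A small sketch with made-up numbers, where 0.0 stands for an ignored position:

```python
import paddle

masked_lm_loss = paddle.to_tensor([[1.0, 0.0, 3.0], [2.0, 0.0, 0.0]])

# Default path: keep only the unmasked (> 0) positions, then take the mean.
default_loss = paddle.mean(
    paddle.masked_select(masked_lm_loss, masked_lm_loss > 0).astype("float32")
)

# XPU path: per-sample mean here, scalar mean later in auto_trainer.py
# (tr_loss = tr_loss.mean() when get_env_device() == "xpu").
xpu_loss = paddle.mean(masked_lm_loss, axis=-1).mean()

print(float(default_loss), float(xpu_loss))  # 2.0 vs 1.0
```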


class LlamaLMHeadAuto(nn.Layer):
def __init__(self, config: LlamaConfig):
super(LlamaLMHeadAuto, self).__init__()
self.config = config
colwise_placements = (

[dist.Replicate(), dist.Shard(1)]
if self.config.tensor_parallel_degree > 1
else [dist.Replicate(), dist.Replicate()]
)
vocab_size = config.vocab_size
self.weight = self.create_parameter(
shape=[config.hidden_size, vocab_size],