
Conversation

@deependujha (Collaborator) commented Sep 8, 2025

What does this PR do?

Fixes #<issue_number>
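
This adds initial FSDP2 support to the Trainer via a new FSDP2Strategy, selectable with strategy="fsdp2". The example script below exercises it end to end: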

import lightning as L
import torch
import torch.nn as nn
import torch.nn.functional as F
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.strategies import FSDP2Strategy
from torch.utils.data import DataLoader, TensorDataset


class SimpleModel(L.LightningModule):
    def __init__(self, input_dim=32, hidden_dim=64, output_dim=10):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, output_dim)
        self.save_hyperparameters()
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log("train_loss", loss, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log("val_loss", loss, prog_bar=True)
        
    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log("test_loss", loss, prog_bar=True)
        
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)


class RandomDataModule(L.LightningDataModule):
    def __init__(self, input_dim=32, output_dim=10, num_samples=1000, batch_size=32):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_samples = num_samples
        self.batch_size = batch_size
        
    def setup(self, stage=None):
        # Generate random data
        if stage == 'fit' or stage is None:
            # Training data
            x_train = torch.randn(self.num_samples, self.input_dim)
            y_train = torch.randint(0, self.output_dim, (self.num_samples,))
            self.train_dataset = TensorDataset(x_train, y_train)
            
            # Validation data
            x_val = torch.randn(self.num_samples // 5, self.input_dim)
            y_val = torch.randint(0, self.output_dim, (self.num_samples // 5,))
            self.val_dataset = TensorDataset(x_val, y_val)
            
        if stage == 'test' or stage is None:
            # Test data
            x_test = torch.randn(self.num_samples // 5, self.input_dim)
            y_test = torch.randint(0, self.output_dim, (self.num_samples // 5,))
            self.test_dataset = TensorDataset(x_test, y_test)
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size)


def main():
    # Set up model and data
    input_dim = 32
    hidden_dim = 64
    output_dim = 10
    
    # Create the model on the meta device (no memory is allocated for the weights);
    # the FSDP2 strategy materializes the parameters in sharded form during setup.
    with torch.device("meta"):
        model = SimpleModel(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
    data_module = RandomDataModule(input_dim=input_dim, output_dim=output_dim)
    
    # Set up the checkpoint callback to save the best model
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath="checkpoints/",
        filename="model-{epoch:02d}-{val_loss:.2f}",
        save_top_k=1,
        mode="min",
    )

    # Alternatively, pass an explicit strategy instance instead of the "fsdp2" alias:
    # fsdp_strategy = FSDP2Strategy()
    
    # Set up the trainer
    trainer = L.Trainer(
        max_epochs=10,
        callbacks=[checkpoint_callback],
        strategy="fsdp2",
        accelerator="auto",  # Automatically detect available accelerator
        devices="auto",      # Use all available devices
    )
    
    # Train the model
    trainer.fit(model, data_module)
    
    if trainer.is_global_zero:
        print(f"Done training! Best model saved at: {checkpoint_callback.best_model_path}")


if __name__ == "__main__":
    main()
Before submitting
  • Was this discussed/agreed via a GitHub issue? (not for typos and docs)
  • Did you read the contributor guideline, Pull Request section?
  • Did you make sure your PR does only one thing, instead of bundling different changes together?
  • Did you make sure to update the documentation with your changes? (if necessary)
  • Did you write any new necessary tests? (not for typos and docs)
  • Did you verify new and existing tests pass locally with your changes?
  • Did you list all the breaking changes introduced by this pull request?
  • Did you update the CHANGELOG? (not for typos, docs, test updates, or minor internal changes/refactors)

PR review

Anyone in the community is welcome to review the PR.
Before you start reviewing, make sure you have read the review guidelines. In short, see the following bullet-list:

Reviewer checklist
  • Is this pull request ready for review? (if not, please submit in draft mode)
  • Check that all items from Before submitting are resolved
  • Make sure the title is self-explanatory and the description concisely explains the PR
  • Add labels and milestones (and optionally projects) to the PR so it can be classified

📚 Documentation preview 📚: https://pytorch-lightning--21184.org.readthedocs.build/en/21184/

@github-actions bot added the pl label (Generic label for PyTorch Lightning package) Sep 8, 2025
@github-actions bot added the fabric label (lightning.fabric.Fabric) Sep 9, 2025
@deependujha marked this pull request as ready for review September 10, 2025 09:23

codecov bot commented Sep 10, 2025

Codecov Report

❌ Patch coverage is 49.04110% with 186 lines in your changes missing coverage. Please review.
✅ Project coverage is 86%. Comparing base (3998b5d) to head (029ebff).
⚠️ Report is 8 commits behind head on master.

Additional details and impacted files
@@           Coverage Diff            @@
##           master   #21184    +/-   ##
========================================
- Coverage      87%      86%    -1%     
========================================
  Files         269      271     +2     
  Lines       23665    24051   +386     
========================================
+ Hits        20642    20710    +68     
- Misses       3023     3341   +318     

@SkafteNicki (Collaborator) left a comment

Probably in a follow-up PR:

@Borda (Member) commented Sep 11, 2025

  • add fsdp2 to fabric?

I would start with FSDP2 in Fabric
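
For reference, a rough sketch of what this could look like in Fabric, assuming a hypothetical future "fsdp2" strategy alias mirroring the Trainer alias added in this PR (Fabric does not expose it yet):

import lightning as L
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hypothetical: strategy="fsdp2" is assumed here and is not part of this PR.
fabric = L.Fabric(accelerator="auto", devices="auto", strategy="fsdp2")
fabric.launch()

model = nn.Linear(32, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
model, optimizer = fabric.setup(model, optimizer)

# One toy step on random data placed on the Fabric device
x = torch.randn(8, 32, device=fabric.device)
y = torch.randint(0, 10, (8,), device=fabric.device)
loss = F.cross_entropy(model(x), y)
fabric.backward(loss)
optimizer.step()
optimizer.zero_grad()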

@deependujha (Collaborator, Author) commented Sep 11, 2025

✅ This PR is functionally complete and introduces initial support for FSDP2 in the PyTorch Lightning Trainer.

Originally, I planned to follow up with:

  • Gradient accumulation support for FSDP2.
  • Discussion & improvements around best practices for wrapping (e.g., top-level fully_shard(model) vs. selectively wrapping layers like nn.Linear, transformer blocks, etc.); see the sketch after this comment.

However, per discussion with @tchaton, we’re ⚠️ pausing this work for now until the PyTorch Lightning Enterprise direction is clear.

I’ll leave this PR open so it can be easily revived later, if we decide to.
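
For context on the wrapping question above, here is a minimal sketch (not part of this PR) contrasting the two granularities with PyTorch's FSDP2 fully_shard API. It assumes a recent PyTorch (2.6+, where fully_shard is exported from torch.distributed.fsdp) and an already-initialized process group / device mesh:

import torch.nn as nn
from torch.distributed.fsdp import fully_shard  # FSDP2 API, PyTorch 2.6+


def wrap_top_level(model: nn.Module) -> nn.Module:
    # One FSDP2 unit for the whole model: simplest, but all parameters are
    # gathered together for every forward/backward.
    fully_shard(model)
    return model


def wrap_per_layer(model: nn.Module) -> nn.Module:
    # One FSDP2 unit per Linear layer plus a root unit: parameters are
    # gathered layer by layer, lowering peak memory at the cost of more
    # communication calls.
    for module in model.modules():
        if isinstance(module, nn.Linear):
            fully_shard(module)
    fully_shard(model)
    return model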
