perfmetrics/scripts/ml_tests/checkpoint/Jax/emulated_checkpoints.py
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import jax
import jax.numpy as jnp
import optax
from flax import linen as nn
from flax.training import checkpoints
from flax.training import train_state


# Mock model definition: a stack of progressively narrower Dense layers.
class SimpleModel(nn.Module):

  @nn.compact
  def __call__(self, x):
    features = [16384, 8192, 4096, 2048, 1024, 512, 256, 128, 1]
    for feature in features:
      x = nn.Dense(features=feature)(x)
      x = nn.relu(x)
    return x
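

# Hypothetical helper (not in the original file), a minimal sketch for sizing
# the mock model: sums the element count of every leaf array in a parameter
# pytree, e.g. count_params(model.init(key, x)).
def count_params(params):
  return sum(p.size for p in jax.tree_util.tree_leaves(params))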


# Mock training step: one gradient update of the MSE loss on a single batch.
def train_step(state, batch):

  def loss_fn(params):
    preds = state.apply_fn(params, batch['x'])
    loss = jnp.mean(jnp.square(preds - batch['y']))
    return loss

  grad_fn = jax.value_and_grad(loss_fn)
  loss, grads = grad_fn(state.params)
  state = state.apply_gradients(grads=grads)
  return state, loss
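

# Hedged sketch (not used below): `train_step` is a pure function of array
# pytrees, so it can be jit-compiled once and reused, assuming the batch
# shapes stay fixed across calls.
train_step_jit = jax.jit(train_step)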
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Train a simple model and save checkpoints.')
parser.add_argument('--checkpoint_dir', type=str, required=True, help='Directory to save checkpoints')
parser.add_argument('--num_train_steps', type=int, default=2000, help='Number of training steps') # Added argument for num_train_steps
args = parser.parse_args()

  # Sample data; split the PRNG key so the inputs, the targets, and the model
  # init each draw from independent random streams.
  key = jax.random.PRNGKey(0)
  key, x_key, y_key = jax.random.split(key, 3)
  x = jax.random.normal(x_key, (10, 5))
  y = jax.random.normal(y_key, (10, 1))

  # Initialize model and optimizer.
  model = SimpleModel()
  params = model.init(key, x)
  optimizer = optax.adam(learning_rate=0.01)

  # Create train state.
  state = train_state.TrainState.create(
      apply_fn=model.apply, params=params, tx=optimizer)

  # Run one mock training step so the saved state reflects a real update.
  state, loss = train_step(state, {'x': x, 'y': y})

  # Emulate a training run: save a checkpoint to checkpoint_dir every 200
  # steps, keeping up to 100 checkpoint files.
  for step in range(args.num_train_steps):
    if step % 200 == 0:
      checkpoints.save_checkpoint(
          args.checkpoint_dir, state, step, keep=100, prefix='checkpoint_')
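
  # Hedged sketch (not in the original script): read back the latest
  # checkpoint with flax's restore_checkpoint to confirm the writes landed;
  # it returns `target` unchanged if no checkpoint file exists.
  restored_state = checkpoints.restore_checkpoint(
      args.checkpoint_dir, target=state, prefix='checkpoint_')
  print('Restored latest checkpoint from', args.checkpoint_dir)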