in src/nanotron/trainer.py [0:0]
def get_cpu_logitems():
    """Collect host memory metrics as a flat list of ``LogItem`` entries.

    Three groups are gathered through ``psutil`` and concatenated in this
    order:

    1. System memory (``cpu_memory/*``) from ``psutil.virtual_memory()``.
    2. Swap (``swap_memory/*``) from ``psutil.swap_memory()``.
    3. Per-process memory (``process_memory/main/*`` for the current process
       and ``process_memory/worker_<idx>/*`` for each of its recursive
       children) from ``Process.memory_info()``.

    Every entry is created with the ``"human_format"`` log format, including
    the ``percent`` entries.

    Returns:
        list: ``LogItem`` objects, CPU entries first, then swap, then
        process entries.
    """
    # Field order matches the emitted log layout. psutil only guarantees
    # ``rss`` and ``vms`` on every platform; the remaining fields are
    # platform-specific, so they are looked up defensively below.
    memory_fields = ("rss", "shared", "vms", "text", "data", "lib", "dirty")

    def process_memory_items(label, mem_info):
        # Build the entries for one process, skipping fields that this
        # platform's memory_info() result does not expose instead of
        # raising AttributeError.
        return [
            LogItem(f"process_memory/{label}/{field}", getattr(mem_info, field), "human_format")
            for field in memory_fields
            if hasattr(mem_info, field)
        ]

    # System-wide memory usage
    memory = psutil.virtual_memory()
    cpu_memory_log_entries = [
        LogItem("cpu_memory/total", memory.total, "human_format"),
        LogItem("cpu_memory/available_bytes", memory.available, "human_format"),
        LogItem("cpu_memory/used_bytes", memory.used, "human_format"),
        LogItem("cpu_memory/percent", memory.percent, "human_format"),
    ]

    # Swap usage
    swap = psutil.swap_memory()
    swap_memory_log_entries = [
        LogItem("swap_memory/total", swap.total, "human_format"),
        LogItem("swap_memory/free", swap.free, "human_format"),
        LogItem("swap_memory/used", swap.used, "human_format"),
        LogItem("swap_memory/percent", swap.percent, "human_format"),
    ]

    # Current process plus all of its descendants (e.g. worker processes)
    process = psutil.Process()
    try:
        worker_processes = process.children(recursive=True)
    except psutil.NoSuchProcess:
        # Best effort: fall back to reporting the main process only.
        worker_processes = []

    process_memory_log_entries = process_memory_items("main", process.memory_info())

    for idx, worker in enumerate(worker_processes):
        # Only the psutil call can raise the errors handled here, so keep
        # the try body minimal.
        try:
            worker_mem = worker.memory_info()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The child exited or is not inspectable; skip it. Its index is
            # not reused, so worker numbering may have gaps.
            continue
        process_memory_log_entries.extend(process_memory_items(f"worker_{idx}", worker_mem))

    return cpu_memory_log_entries + swap_memory_log_entries + process_memory_log_entries