components/otelopscol/receiver/nvmlreceiver/scraper.go (73 lines of code) (raw):
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build gpu
// +build gpu
package nvmlreceiver
import (
"context"
"fmt"
"time"
"go.opentelemetry.io/collector/component"
"go.opentelemetry.io/collector/pdata/pcommon"
"go.opentelemetry.io/collector/pdata/pmetric"
"go.opentelemetry.io/collector/receiver"
"github.com/GoogleCloudPlatform/opentelemetry-operations-collector/components/otelopscol/receiver/nvmlreceiver/internal/metadata"
)
type nvmlScraper struct {
config *Config
settings receiver.Settings
client *nvmlClient
mb *metadata.MetricsBuilder
}
func newNvmlScraper(config *Config, settings receiver.Settings) *nvmlScraper {
return &nvmlScraper{config: config, settings: settings}
}
func (s *nvmlScraper) start(_ context.Context, _ component.Host) error {
var err error
s.client, err = newClient(s.config, s.settings.Logger)
if err != nil {
return err
}
startTime := pcommon.NewTimestampFromTime(time.Now())
mbConfig := metadata.DefaultMetricsBuilderConfig()
mbConfig.Metrics = s.config.Metrics
s.mb = metadata.NewMetricsBuilder(
mbConfig, s.settings, metadata.WithStartTime(startTime))
return nil
}
func (s *nvmlScraper) stop(_ context.Context) error {
if s.client != nil {
return s.client.cleanup()
}
return nil
}
func (s *nvmlScraper) scrape(_ context.Context) (pmetric.Metrics, error) {
deviceMetrics, err := s.client.collectDeviceMetrics()
for _, metric := range deviceMetrics {
timestamp := pcommon.NewTimestampFromTime(metric.time)
model := s.client.getDeviceModelName(metric.gpuIndex)
UUID := s.client.getDeviceUUID(metric.gpuIndex)
gpuIndex := fmt.Sprintf("%d", metric.gpuIndex)
switch metric.name {
case "nvml.gpu.utilization":
s.mb.RecordNvmlGpuUtilizationDataPoint(
timestamp, metric.asFloat64(), model, gpuIndex, UUID)
case "nvml.gpu.memory.bytes_used":
s.mb.RecordNvmlGpuMemoryBytesUsedDataPoint(
timestamp, metric.asInt64(), model, gpuIndex, UUID, metadata.AttributeMemoryStateUsed)
case "nvml.gpu.memory.bytes_free":
s.mb.RecordNvmlGpuMemoryBytesUsedDataPoint(
timestamp, metric.asInt64(), model, gpuIndex, UUID, metadata.AttributeMemoryStateFree)
}
}
processMetrics := s.client.collectProcessMetrics()
for _, metric := range processMetrics {
timestamp := pcommon.NewTimestampFromTime(metric.time)
model := s.client.getDeviceModelName(metric.gpuIndex)
UUID := s.client.getDeviceUUID(metric.gpuIndex)
gpuIndex := fmt.Sprintf("%d", metric.gpuIndex)
s.mb.RecordNvmlGpuProcessesUtilizationDataPoint(
timestamp, float64(metric.lifetimeGpuUtilization)/100.0, model, gpuIndex, UUID, int64(metric.processPid),
metric.processName, metric.command, metric.commandLine, metric.owner)
s.mb.RecordNvmlGpuProcessesMaxBytesUsedDataPoint(
timestamp, int64(metric.lifetimeGpuMaxMemory), model, gpuIndex, UUID, int64(metric.processPid),
metric.processName, metric.command, metric.commandLine, metric.owner)
}
return s.mb.Emit(), err
}