in tools/perf-scale-workload/model.py [0:0]
import numpy as np

## measuresForMetrics and measuresForEvents are assumed to be module-level lists of
## measure names defined elsewhere in model.py.
def printModelSummary(dimensionsMetrics, dimensionsEvents, metricInterval, eventInterval, wide=False):
    print("Dimensions for metrics: {:,}".format(len(dimensionsMetrics)))
    print("Dimensions for events: {:,}".format(len(dimensionsEvents)))
    ## Approximate per-row dimension bytes: each dimension value's length is counted
    ## twice, a rough stand-in for the dimension name plus its value.
    eventsDimensionBytes = np.sum([len(getattr(dimensionsEvents[0], dimName)) * 2 for dimName in dimensionsEvents[0]._fields])
    metricsDimensionBytes = np.sum([len(getattr(dimensionsMetrics[0], dimName)) * 2 for dimName in dimensionsMetrics[0]._fields])
    ## Total bytes taken up by the measure names themselves.
    metricsMeasureBytesTotal = np.sum([len(x) for x in measuresForMetrics])
    eventsMeasureBytesTotal = np.sum([len(x) for x in measuresForEvents])
    print("Dimension bytes: Events: {}, Metrics: {}".format(eventsDimensionBytes, metricsDimensionBytes))
    print("Bytes for measure names: Metrics: {}, Events: {}".format(metricsMeasureBytesTotal, eventsMeasureBytesTotal))
    ## In the narrow model, each measure data point becomes its own row. In the wide model,
    ## by contrast, the 20 metrics become one row and the 5 events become another: instead of
    ## 25 rows, we write 2. Similarly, if we count the number of time series as the distinct
    ## combinations of dimension names, dimension values, and measure names, the wide model
    ## also sees a significant reduction in the number of time series. Each row in the wide
    ## model is wider, but the repeated dimensions and timestamps go away, which is why the
    ## total data size, ingestion volume, etc. should also be lower.
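    ## A quick worked example of that reduction, using the 20 metrics and 5 events above for
    ## a single host emitting once per interval:
    ##   narrow rows per interval = 20 + 5 = 25;  wide rows per interval = 1 + 1 = 2
    ##   narrow time series       = hosts * 25;   wide time series       = hosts * 2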
    if wide:
        ## One wide row per dimension set, so the time series count is just the number of dimension sets.
        numTimeseries = len(dimensionsMetrics) + len(dimensionsEvents)
        numDataPointsPerSecond = round((1 / metricInterval) * len(dimensionsMetrics) + (1 / eventInterval) * len(dimensionsEvents))
        ## Rows collapse in the wide model, but each row still carries all the individual measures.
        numMeasuresPerSecond = round((1 / metricInterval) * len(dimensionsMetrics) * len(measuresForMetrics) + (1 / eventInterval) * len(dimensionsEvents) * len(measuresForEvents))
        numDataPointsPerHour = 3600 * numDataPointsPerSecond
        numMetricsPerSecond = round((1 / metricInterval) * len(dimensionsMetrics))
        numEventsPerSecond = round((1 / eventInterval) * len(dimensionsEvents))
        ## Metrics row size: dimension bytes, "metrics" as the measure name (7 bytes), an 8-byte
        ## timestamp, 8 bytes for each metric value, and the bytes of the metric names.
        avgMetricsRowSize = metricsDimensionBytes + 7 + 8 + len(measuresForMetrics) * 8 + metricsMeasureBytesTotal
        ## Events row size: dimension bytes, "events" as the measure name (6 bytes), an 8-byte
        ## timestamp, and the bytes of the event measure names.
        avgEventsRowSize = eventsDimensionBytes + 6 + 8 + eventsMeasureBytesTotal
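        ## Worked example with hypothetical inputs: 100 dimension bytes and 20 metric names of
        ## 10 characters each gives 100 + 7 + 8 + 20 * 8 + 200 = 475 bytes per wide metrics row.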
        ingestionVolume = round((numMetricsPerSecond * avgMetricsRowSize + numEventsPerSecond * avgEventsRowSize) / (1024.0 * 1024.0), 2)
    else:
        ## One narrow row per (dimension set, measure) pair.
        numTimeseries = len(dimensionsMetrics) * len(measuresForMetrics) + len(dimensionsEvents) * len(measuresForEvents)
        numDataPointsPerSecond = round((1 / metricInterval) * len(dimensionsMetrics) * len(measuresForMetrics) + (1 / eventInterval) * len(dimensionsEvents) * len(measuresForEvents))
        numMeasuresPerSecond = numDataPointsPerSecond
        numDataPointsPerHour = 3600 * numDataPointsPerSecond
        numMetricsPerSecond = round((1 / metricInterval) * len(dimensionsMetrics) * len(measuresForMetrics))
        numEventsPerSecond = round((1 / eventInterval) * len(dimensionsEvents) * len(measuresForEvents))
        ## Average measure-name length, since a narrow row carries a single measure name.
        metricsMeasureBytes = np.average([len(x) for x in measuresForMetrics])
        eventsMeasureBytes = np.average([len(x) for x in measuresForEvents])
        ## Narrow row size: dimension bytes, one measure name, plus 16 bytes for the 8-byte
        ## timestamp and 8-byte measure value.
        ingestionVolume = round((numMetricsPerSecond * (metricsDimensionBytes + metricsMeasureBytes + 16) + numEventsPerSecond * (eventsDimensionBytes + eventsMeasureBytes + 16)) / (1024.0 * 1024.0), 2)
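        ## Worked example with hypothetical inputs: 100,000 narrow rows/second at ~120 bytes each
        ## is 100000 * 120 / (1024 * 1024) ≈ 11.44 MB/s, i.e. roughly 40 GB ingested per hour.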
    ## Roll the per-second ingestion rate (MB/s) up to hourly (GB), daily (GB), and yearly (TB) sizes.
    dataSizePerHour = round(ingestionVolume * 3600 / 1024.0, 2)
    dataSizePerDay = round(dataSizePerHour * 24, 2)
    dataSizePerYear = round(dataSizePerDay * 365 / 1024.0, 2)
    avgRowSizeBytes = round(ingestionVolume * 1024 * 1024 / numDataPointsPerSecond, 2)
    print("Avg. row size: {} bytes".format(avgRowSizeBytes))
    print("Number of timeseries: {:,}. Avg. data points per second: {:,}. Avg. measures per second: {:,}. Avg. data points per hour: {:,}".format(
        numTimeseries, numDataPointsPerSecond, numMeasuresPerSecond, numDataPointsPerHour))
    print("Avg. ingestion volume: {:,} MB/s. Data size per hour: {:,} GB. Data size per day: {:,} GB. Data size per year: {:,} TB".format(
        ingestionVolume, dataSizePerHour, dataSizePerDay, dataSizePerYear))
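
## A minimal usage sketch, assuming hypothetical dimension shapes and measure lists; in
## model.py the real dimensions and the measuresForMetrics / measuresForEvents globals are
## built elsewhere, so every name below is illustrative only.
if __name__ == '__main__':
    from collections import namedtuple

    MetricDim = namedtuple('MetricDim', ['region', 'cell', 'host'])
    EventDim = namedtuple('EventDim', ['region', 'cell', 'host', 'process'])
    ## Hypothetical stand-ins for the module-level measure-name lists.
    measuresForMetrics = ['cpu_user', 'cpu_system', 'memory_used', 'disk_io_reads']
    measuresForEvents = ['task_completed', 'task_failed']
    dimensionsMetrics = [MetricDim('us-east-1', 'cell-1', 'host-{}'.format(i)) for i in range(100)]
    dimensionsEvents = [EventDim('us-east-1', 'cell-1', 'host-{}'.format(i), 'server') for i in range(100)]
    ## Metrics emitted every 60 seconds, events every 10 seconds; compare narrow vs. wide.
    printModelSummary(dimensionsMetrics, dimensionsEvents, 60, 10, wide=False)
    printModelSummary(dimensionsMetrics, dimensionsEvents, 60, 10, wide=True)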