# kubernetes/storage/fio-testing/k8s/gcs/generate-data/generate-data-gcs.yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
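#
# This manifest defines two resources:
#   1. A ConfigMap (file-generator-config) holding a bash script that uses fio
#      to generate 1000 test files of 1-5 MB each under ${OUTPUT_DIR}.
#   2. A Job (file-generator-job-gcs) that runs the script against a GCS bucket
#      mounted through the GKE Cloud Storage FUSE CSI ephemeral volume.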
# ConfigMap: file-generator-config
apiVersion: v1
kind: ConfigMap
metadata:
  name: file-generator-config
data:
  generate_files.sh: |
    #!/bin/bash
    RETRY_COUNTER=0
    MAX_RETRIES=5
    SLEEP_TIME=1
    # Check if OUTPUT_DIR environment variable is set, otherwise use default
    if [ -z "${OUTPUT_DIR}" ]; then
      OUTPUT_DIR="/data" # Default to /data inside the container
      echo "OUTPUT_DIR not set, using default: ${OUTPUT_DIR}"
    fi
    echo "Loading data into ${OUTPUT_DIR}"
    echo "Waiting for GCS FUSE mount..."
    OUTPUT_BASE_DIR=$(dirname "${OUTPUT_DIR}")
    while [ $RETRY_COUNTER -lt $MAX_RETRIES ]; do
      if mountpoint -q "${OUTPUT_BASE_DIR}"; then
        echo "[$(date +%Y-%m-%d_%H:%M:%S)] GCS FUSE mount detected at ${OUTPUT_BASE_DIR}"
        break
      fi
      echo "[$(date +%Y-%m-%d_%H:%M:%S)] GCS FUSE mount not detected, retrying in ${SLEEP_TIME} seconds..."
      sleep $SLEEP_TIME
      RETRY_COUNTER=$((RETRY_COUNTER + 1))
      SLEEP_TIME=$((SLEEP_TIME * 2)) # Exponential backoff
      if [ $RETRY_COUNTER -eq $MAX_RETRIES ]; then
        echo "[$(date +%Y-%m-%d_%H:%M:%S)] Failed to detect GCS FUSE mount at ${OUTPUT_BASE_DIR} after ${MAX_RETRIES} retries"
        exit 1
      fi
    done
    # Create or clean directory for test files (using /data)
    if [ -d "${OUTPUT_DIR}" ]; then
      echo "Cleaning existing directory: ${OUTPUT_DIR}"
      rm -rf "${OUTPUT_DIR}"/*
    else
      echo "Creating directory: ${OUTPUT_DIR}"
    fi
    mkdir -p "${OUTPUT_DIR}"
    # Function to generate a single file using FIO
    generate_file() {
      file_num=$1
      size=$2
      fio --name=generate_file_${file_num} \
        --ioengine=sync \
        --rw=write \
        --bs=1m \
        --direct=1 \
        --size=${size}M \
        --filename="${OUTPUT_DIR}/testfile_${file_num}" \
        --thread \
        --group_reporting \
        --minimal
      # >/dev/null 2>&1 # Redirect output to /dev/null
    }
    echo "Starting file generation..."
    # Generate 1000 files with random sizes between 1-5MB
    for file_num in $(seq 1 1000); do
      # Generate random size between 1-5 (MB)
      size=$(( (RANDOM % 5) + 1 ))
      # Show progress every 100 files
      if [ $((file_num % 100)) -eq 0 ]; then
        echo "Generated $file_num files..."
      fi
      generate_file "$file_num" "$size" &
      # Limit concurrent FIO processes to avoid system overload
      # Wait if we have too many background processes
      if [ $((file_num % 20)) -eq 0 ]; then
        wait
      fi
    done
    # Wait for any remaining background processes to complete
    wait
    echo "File generation complete!"
    # Display summary of generated files (adapted for /data)
    total_size=$(du -sh "${OUTPUT_DIR}" | cut -f1)
    echo "Total size of generated files: $total_size"
    echo "File size distribution:"
    # Skip the "total" header line of ls -l (NR > 1) so it does not skew the average
    ls -l "${OUTPUT_DIR}" | awk 'NR > 1 { sum += $5; n++ } END { if (n > 0) print "Average file size: " sum/n/1024/1024 " MB" }'
    # Optional: Show detailed distribution
    echo -e "\nDetailed size distribution:"
    ls -l "${OUTPUT_DIR}" | awk 'NR > 1 { print int($5/1024/1024)"MB" }' | sort | uniq -c
---
# Job: file-generator-job
apiVersion: batch/v1
kind: Job
metadata:
  name: file-generator-job-gcs
spec:
  ttlSecondsAfterFinished: 120
  completions: 1
  parallelism: 1
  template:
    metadata:
      annotations:
        # Inject the gcsfuse sidecar and bound its resource usage
        gke-gcsfuse/volumes: "true"
        gke-gcsfuse/cpu-limit: "1"
        gke-gcsfuse/memory-limit: "1Gi"
        gke-gcsfuse/ephemeral-storage-limit: "5Gi"
        gke-gcsfuse/cpu-request: "50m"
        gke-gcsfuse/memory-request: "64Mi"
        gke-gcsfuse/ephemeral-storage-request: "200Mi"
    spec:
      priorityClassName: higher-priority
      nodeSelector:
        cloud.google.com/compute-class: spot-capacity
      containers:
        - name: file-generator
          image: us-docker.pkg.dev/fsi-research-1/research-images/fio:latest
          command: ["/bin/bash", "/scripts/generate_files.sh"]
          env:
            - name: OUTPUT_DIR
              value: "/data/test_files"
          resources:
            requests:
              memory: "2Gi"
              cpu: "1"
            limits:
              memory: "4Gi"
              cpu: "2"
          volumeMounts:
            - name: script-volume
              mountPath: /scripts
            - name: gcs-fuse-csi-ephemeral
              mountPath: /data
      volumes:
        - name: script-volume
          configMap:
            name: file-generator-config
        - name: gcs-fuse-csi-ephemeral
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: fsi-research-1-us-east4-gke-data-a8w9 # non-cache bucket
              mountOptions: "implicit-dirs"
              gcsfuseLoggingSeverity: warning
      restartPolicy: Never
  backoffLimit: 4
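
# Typical usage (a sketch; assumes kubectl access to a GKE cluster with the
# Cloud Storage FUSE CSI driver enabled, an existing "higher-priority"
# PriorityClass, and nodes matching the spot-capacity compute-class selector):
#
#   kubectl apply -f generate-data-gcs.yaml
#   kubectl logs -f job/file-generator-job-gcs
#
# The Job is garbage-collected 120 seconds after it finishes
# (ttlSecondsAfterFinished: 120).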