kubernetes/storage/fio-testing/k8s/gcs/generate-data/generate-data-gcs.yaml (126 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ConfigMap: file-generator-config
# Holds the bash script that the Job below mounts at /scripts and executes.
apiVersion: v1
kind: ConfigMap
metadata:
  name: file-generator-config
data:
  generate_files.sh: |
    #!/bin/bash
    # Writes 1000 fio-generated test files (1-5 MB each) into OUTPUT_DIR after
    # waiting for the parent directory of OUTPUT_DIR to become a mount point.
    #
    # Environment:
    #   OUTPUT_DIR  Directory that receives the files (default: /data).
    # Exit status:
    #   1 if the mount point is not detected after MAX_RETRIES attempts.

    RETRY_COUNTER=0
    MAX_RETRIES=5
    SLEEP_TIME=1

    # Check if OUTPUT_DIR environment variable is set, otherwise use default
    if [ -z "${OUTPUT_DIR}" ]; then
      OUTPUT_DIR="/data"  # Default to /data inside the container
      echo "OUTPUT_DIR not set, using default: ${OUTPUT_DIR}"
    fi

    echo "Loading Data in to $OUTPUT_DIR"
    echo "Waiting for GCS FUSE mount..."

    # The mount check targets the PARENT of OUTPUT_DIR (e.g. /data for
    # /data/test_files).
    # NOTE(review): with the default OUTPUT_DIR=/data the parent is "/", which
    # is always a mount point, so the check passes trivially -- confirm intent.
    OUTPUT_BASE_DIR=$(dirname "${OUTPUT_DIR}")

    while [ "$RETRY_COUNTER" -lt "$MAX_RETRIES" ]; do
      if mountpoint -q "$OUTPUT_BASE_DIR"; then
        echo "[$(date +%Y-%m-%d_%H:%M:%S)] GCS FUSE mount detected at $OUTPUT_BASE_DIR"
        break
      fi
      echo "[$(date +%Y-%m-%d_%H:%M:%S)] GCS FUSE mount not detected, retrying in $SLEEP_TIME seconds..."
      sleep "$SLEEP_TIME"
      # Advance the same counter the loop condition tests; otherwise the loop
      # never terminates when the mount does not appear.
      RETRY_COUNTER=$((RETRY_COUNTER + 1))
      SLEEP_TIME=$((SLEEP_TIME * 2))  # Exponential backoff
      if [ "$RETRY_COUNTER" -eq "$MAX_RETRIES" ]; then
        echo "[$(date +%Y-%m-%d_%H:%M:%S)] Failed to detect GCS FUSE mount at $OUTPUT_BASE_DIR after multiple retries" >&2
        # Nothing has been created yet, so there is nothing to clean up.
        exit 1
      fi
    done

    # Create or clean directory for test files
    if [ -d "${OUTPUT_DIR}" ]; then
      echo "Cleaning existing directory: ${OUTPUT_DIR}"
      # ":?" aborts instead of expanding to "/*" should OUTPUT_DIR ever be empty.
      rm -rf "${OUTPUT_DIR:?}"/*
    else
      echo "Creating directory: ${OUTPUT_DIR}"
    fi
    mkdir -p "${OUTPUT_DIR}"

    # Generate a single file using FIO.
    #   $1  file number, used in the fio job name and the output file name
    #   $2  file size in MB
    generate_file() {
      local file_num=$1
      local size=$2
      # Append ">/dev/null 2>&1" to the last option to silence fio's output.
      fio --name="generate_file_${file_num}" \
        --ioengine=sync \
        --rw=write \
        --bs=1m \
        --direct=1 \
        --size="${size}M" \
        --filename="${OUTPUT_DIR}/testfile_${file_num}" \
        --thread \
        --group_reporting \
        --minimal
    }

    echo "Starting file generation..."

    # Generate 1000 files with random sizes between 1-5MB
    for file_num in $(seq 1 1000); do
      # Generate random size between 1-5 (MB)
      size=$(( (RANDOM % 5) + 1 ))

      generate_file "$file_num" "$size" &

      # Limit concurrent FIO processes to avoid system overload:
      # run in batches of 20 and wait for each batch to finish.
      if [ $((file_num % 20)) -eq 0 ]; then
        wait
        # Show progress every 100 files. Reported after the batch has finished
        # so the number reflects files that have actually been processed.
        if [ $((file_num % 100)) -eq 0 ]; then
          echo "Generated $file_num files..."
        fi
      fi
    done

    # Wait for any remaining background processes to complete
    wait

    echo "File generation complete!"

    # Display summary of generated files
    total_size=$(du -sh "${OUTPUT_DIR}" | cut -f1)
    echo "Total size of generated files: $total_size"
    echo "File size distribution:"
    # Only rows whose mode string starts with "-" (regular files) are counted;
    # this skips the "total" line that "ls -l" prints first. The n > 0 guard
    # avoids a division by zero when the directory is empty.
    ls -l "${OUTPUT_DIR}" | awk '
      $1 ~ /^-/ { sum += $5; n++ }
      END {
        if (n > 0) print "Average file size: " sum/n/1024/1024 " MB"
        else print "Average file size: n/a (no files found)"
      }'

    # Optional: Show detailed distribution
    echo -e "\nDetailed size distribution:"
    ls -l "${OUTPUT_DIR}" | awk '$1 ~ /^-/ { print int($5/1024/1024)"MB" }' | sort | uniq -c
---
# Job: file-generator-job
# Runs generate_files.sh once against a GCS bucket mounted through the
# Cloud Storage FUSE CSI driver.
apiVersion: batch/v1
kind: Job
metadata:
  name: file-generator-job-gcs
spec:
  ttlSecondsAfterFinished: 120
  completions: 1
  parallelism: 1
  template:
    metadata:
      annotations:
        gke-gcsfuse/volumes: "true"
        gke-gcsfuse/cpu-limit: "1"
        gke-gcsfuse/memory-limit: "1Gi"
        gke-gcsfuse/ephemeral-storage-limit: "5Gi"
        gke-gcsfuse/cpu-request: "50m"
        gke-gcsfuse/memory-request: "64Mi"
        gke-gcsfuse/ephemeral-storage-request: "200Mi"
    spec:
      priorityClassName: higher-priority
      nodeSelector:
        cloud.google.com/compute-class: spot-capacity
      containers:
        - name: file-generator
          image: us-docker.pkg.dev/fsi-research-1/research-images/fio:latest
          command: ["/bin/bash", "/scripts/generate_files.sh"]
          env:
            # The script checks the parent of this path (/data) for a mount.
            - name: OUTPUT_DIR
              value: "/data/test_files"
          resources:
            requests:
              memory: "2Gi"
              cpu: "1"
            limits:
              memory: "4Gi"
              cpu: "2"
          volumeMounts:
            - name: script-volume
              mountPath: /scripts
            - name: gcs-fuse-csi-ephemeral
              mountPath: /data
      volumes:
        - name: script-volume
          configMap:
            name: file-generator-config
        - name: gcs-fuse-csi-ephemeral
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: fsi-research-1-us-east4-gke-data-a8w9
              # Non cache
              mountOptions: "implicit-dirs"
              gcsfuseLoggingSeverity: warning
      restartPolicy: Never
  # Job-level field: number of pod retries before the Job is marked failed.
  backoffLimit: 4