def split_csv()

in fast/project-templates/secops-anonymization-pipeline/source/shared/utils.py [0:0]


def split_csv(bucket_name, blob_name, file_size):
  """Splits a CSV file into smaller chunks and uploads them back to the bucket.

  Downloads the blob to a local temp file, splits it into chunks of roughly
  MAX_FILE_SIZE bytes (sized by line count), uploads each chunk as
  ``<basename>_<index>.log``, then deletes the original blob.

    Args:
      bucket_name: The name of the GCS bucket.
      blob_name: The name of the CSV blob in the bucket.
      file_size: The total size of the source blob in bytes; used together
        with MAX_FILE_SIZE to derive how many lines fit in each chunk.
    """
  storage_client = storage.Client()
  bucket = storage_client.bucket(bucket_name)
  blob = bucket.blob(blob_name)

  # Download the blob to a local file
  temp_file = '/tmp/temp.csv'
  blob.download_to_filename(temp_file)

  # Count lines once (with a context manager so the handle is closed —
  # the original leaked it). Raw line count keeps the counting pass
  # consistent with the line-based splitting pass below.
  with open(temp_file, encoding="utf8") as f_count:
    numline = sum(1 for _ in f_count)

  # Lines per chunk: scale the total line count by the fraction of the
  # file that fits in MAX_FILE_SIZE. Guard with max(1, ...) so a very
  # large file never yields a chunk size of zero.
  chunk_number = max(1, math.ceil(numline * MAX_FILE_SIZE / file_size))

  def _upload_chunk(chunk_lines, index):
    """Writes chunk_lines to a local temp file, uploads it, and cleans up."""
    chunk_filename = f'{blob_name.split(".")[0]}_{index}.log'
    chunk_path = f'/tmp/temp-{index}.csv'
    with open(chunk_path, 'w', encoding="utf8") as fout:
      fout.writelines(chunk_lines)
    chunk_blob = bucket.blob(f'{chunk_filename}')
    chunk_blob.upload_from_filename(chunk_path)
    print(f'Uploaded {chunk_filename} to {bucket_name}')
    os.remove(chunk_path)  # Remove the local chunk file

  index = 0
  lines = []
  with open(temp_file, 'r', encoding="utf8") as f_in:
    # Iterate raw lines instead of csv.reader(delimiter='\n'): blank
    # lines made the original crash on line[0] (empty-row IndexError).
    for line in f_in:
      lines.append(line if line.endswith("\n") else line + "\n")
      if len(lines) == chunk_number:
        _upload_chunk(lines, index)
        index += 1
        lines = []

  # Flush the remainder; skip when the file split evenly (the original
  # uploaded an empty trailing chunk in that case).
  if lines:
    _upload_chunk(lines, index)

  # Remove the temporary file
  os.remove(temp_file)

  # remove old log file
  blob.delete()