in infra-as-code/modules/ingest-pipeline/cf-stt-transcript/lib.py [0:0]
def order_transcript(self, bucket_name, filename):
    """Sort a stored transcript's results by end offset and re-upload it.

    Downloads the JSON blob, drops every result that has no
    ``alternatives`` key (empty transcripts), sorts the remaining results
    in ascending order of ``resultEndOffset`` so the turns come out in
    order, and overwrites the same blob with the ordered JSON.

    A transcript with no ``results`` key (or a null one) is treated as
    having no results, and a result without ``resultEndOffset`` sorts as
    offset 0, so neither case raises ``KeyError``.

    Args:
        bucket_name (str): Bucket name
        filename (str): Blob name
    """

    def end_offset_seconds(result):
        # Offsets are expected to be strings with a trailing unit, e.g.
        # "12.340s"; strip the suffix before converting to a number.
        return float(result.get("resultEndOffset", "0s").rstrip("s"))

    bucket = self.storage_client.bucket(bucket_name)
    blob = bucket.blob(filename)
    transcript_data = json.loads(blob.download_as_text())

    # A transcript of silent/empty audio may carry no "results" at all.
    results = transcript_data.get("results") or []
    transcript_data["results"] = sorted(
        (item for item in results if "alternatives" in item),
        key=end_offset_seconds,
    )

    modified_contents = json.dumps(transcript_data, indent=2)
    blob.upload_from_string(modified_contents, content_type='application/json')
    print(f"Modified JSON file '{filename}' successfully updated in bucket '{bucket_name}'.")