def __init__()

def init()

in infra-as-code/modules/ingest-pipeline/cf-transcript-correction/lib.py [0:0]
56 lines of code
1 McCabe index (conditional complexity)

  def __init__(
    self,
    project_id,
    location_id,
    model_name,
    transcript_bucket_id,
    transcript_file_name,
    formatted_audio_file_name,
    formatted_audio_bucket_id,
    ingest_record_bucket_id,
    original_file_name,
    client_specific_constraints,
    client_specific_context,
    few_shot_examples,
  ):
    """Load the transcript and prepare the correction prompt and schema.

    The constructor performs I/O: it obtains credentials, creates a storage
    client, downloads the transcript file and extracts its transcripts before
    building the prompt and the response schema.

    Args:
      project_id: Project passed to ``storage.Client``.
      location_id: Stored on the instance; not used by the constructor.
      model_name: Stored on the instance; not used by the constructor.
      transcript_bucket_id: Bucket argument for ``download_from_gcs``.
      transcript_file_name: File argument for ``download_from_gcs``; also
        printed in the start-up message.
      formatted_audio_file_name: Stored on the instance.
      formatted_audio_bucket_id: Stored on the instance.
      ingest_record_bucket_id: Forwarded to ``RecordKeeper``.
      original_file_name: Forwarded to ``RecordKeeper`` and recorded under
        the ``'original_file_name'`` key of ``self.event_dict``.
      client_specific_constraints: Interpolated into the <CONSTRAINTS>
        section of the prompt.
      client_specific_context: Interpolated into the <CONTEXT> section of
        the prompt.
      few_shot_examples: Interpolated into the <FEW_SHOT_EXAMPLES> section
        of the prompt.
    """
    # Plain configuration values.
    self.project_id = project_id
    self.location_id = location_id
    self.model_name = model_name
    self.formatted_audio_bucket_id = formatted_audio_bucket_id
    self.formatted_audio_file_name = formatted_audio_file_name
    self.transcript_bucket_id = transcript_bucket_id
    self.transcript_file_name = transcript_file_name
    # Starts empty; nothing in this constructor fills it.
    self.gemini_transcript = ""

    # The storage client is created before the download below.
    # NOTE(review): presumably download_from_gcs uses self.storage_client - confirm.
    creds = self.get_credentials()
    self.storage_client = storage.Client(project=self.project_id, credentials=creds)

    self.original_transcript = self.download_from_gcs(
      self.transcript_bucket_id, self.transcript_file_name
    )
    self.transcript = self.extract_transcripts(self.original_transcript)

    self.record_keeper = RecordKeeper(
      ingest_record_bucket_id, original_file_name, self.storage_client
    )
    # NOTE(review): event_dict is not assigned anywhere in this constructor;
    # presumably it is defined elsewhere on the class - confirm.
    self.event_dict['original_file_name'] = original_file_name

    self.client_specific_constraints = client_specific_constraints
    self.client_specific_context = client_specific_context
    self.few_shot_examples = few_shot_examples

    # The prompt text is sent as-is, so its internal indentation and spacing
    # are part of the runtime string. The last instruction step is numbered
    # 4 (it previously repeated 3).
    self.prompt = f"""
    <OBJECTIVE_AND_PERSONA>
      You are an expert audio transcription editor. Your primary goal is to correct errors in transcripts while preserving the original JSON format. You have a strong understanding of the provided terminology and are familiar with common transcription challenges.
    </OBJECTIVE_AND_PERSONA>

    <INSTRUCTIONS>
      You will receive an audio file and its corresponding transcript in JSON format. Your job is to:
      1. Carefully listen to the entire audio file.
      2. Review the entire transcript provided.
      3. Compare, Identify and correct any discrepancies between the audio and the transcript. Pay close attention to:
          * **Key Terms:** Ensure accuracy in transcribing all key terms, names, and phrases specific to the client.  
          * **Speaker Misattribution:** Correctly identify and label different speakers.
          * **General Errors:** Fix misspellings, grammatical errors, and any other inaccuracies.
          * **Keep the fillers:** Keep any fillers used. Example: "Mhm"
      4. Preserve the original JSON structure, including all key-value pairs, nesting, and formatting. Only the text content within the transcript should be modified, keep the same amount of objects in the input transcript.

    </INSTRUCTIONS>

    <CONSTRAINTS>
    Dos and don'ts for the following aspects:
      {self.client_specific_constraints}
    </CONSTRAINTS>

    <CONTEXT>
      {self.client_specific_context}
    </CONTEXT>

    <FEW_SHOT_EXAMPLES>
      Example of transcripts with correct terminology:
        {self.few_shot_examples}
    </FEW_SHOT_EXAMPLES>

    <INPUTS>
      Transcript: {self.transcript} 
    </INPUTS>

    <OUTPUTS>
      Return the transcript with the correct response_schema
    </OUTPUTS>

    Remember that before you answer, you must check to see if the answer complies with your mission. If not, you must respond, "I am not able to answer this question"
    """

    # Schema for the expected output: an array of objects, each with
    # index, transcript and channelTag, all three required.
    self.response_schema = {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "index": {
            "type": "integer",
            "description": "Index position of the transcript"
          },
          "transcript": {
            "type": "string",
            "description": "The transcript text"
          },
          "channelTag": {
            "type": "integer",
            "description": "Channel identifier"
          }
        },
        "required": ["index", "transcript", "channelTag"]
      }
    }

    print(f'Starting transcript fix on: {self.transcript_file_name}')
    # NOTE(review): the prompt embeds the full transcript, so this writes the
    # whole transcript to stdout - confirm this is acceptable for the logs.
    print(f'Using prompt: {self.prompt}')
def __init__()

def init()