def main()

in projects/imagen-voice-captioning/imagen-voice-captioning.py [0:0]


def main():
    """Run the interactive camera-captioning loop.

    Reads the configuration via ``parse_config_args(CONFIG_FILE)``, exports
    the configured credentials path as ``GOOGLE_APPLICATION_CREDENTIALS``,
    opens the configured camera and shows its frames in a window.  Pressing
    <space> sends the current frame to ``query_imagen_caption``; any returned
    captions are passed to ``tts_captions`` and the resulting MP3 bytes are
    played back.  Pressing ``q`` leaves the loop.

    Raises:
        SystemExit: always.  Exit code 1 if the camera cannot be opened,
            exit code 0 after the user quits.
    """
    config = parse_config_args(CONFIG_FILE)
    params = config["parameters"]

    input_dev = int(params["input"])

    # Set the environment variable GOOGLE_APPLICATION_CREDENTIALS to
    # the path of your Google Cloud service account key file.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = params["credentials"]

    # Instantiates a TTS client
    tts_client = texttospeech.TextToSpeechClient()

    # Open the camera feed and request a 640x480 capture size.
    print(f"Opening camera: {input_dev}")
    cap = cv2.VideoCapture(input_dev)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

    if not cap.isOpened():
        print(f"Cannot open camera {input_dev}")
        # SystemExit is what exit() raises; raising it directly avoids
        # depending on the helper that the `site` module injects.
        raise SystemExit(1)

    print("Select the camera view window by clicking it")
    print("Press <space> to caption the camera view. Press q to quit")

    byte_io = io.BytesIO()

    try:
        while cap.isOpened():
            # ret is True only if a frame was read correctly.
            ret, frame = cap.read()
            if ret:
                cv2.imshow("Imagen Voice Captioning", frame)

            # Poll the keyboard on every iteration, even after a failed
            # read, so GUI events are processed and "q" can still quit.
            pressed = cv2.waitKey(1)
            if pressed == ord("q"):
                break
            if not ret or pressed != ord(" "):
                continue

            # Query imagen
            byte_io = convert_image(frame, byte_io)
            captions = query_imagen_caption(byte_io.read())
            if not captions:
                continue

            # Query TTS
            audio = tts_captions(captions, tts_client)
            if audio:
                audiosegment = AudioSegment.from_file(
                    io.BytesIO(audio), format="mp3"
                )
                play(audiosegment)
    finally:
        # Always free the camera and close the window, including when a
        # caption/TTS request raises or the user interrupts the program.
        cap.release()
        cv2.destroyAllWindows()

    raise SystemExit(0)