in projects/imagen-voice-captioning/imagen-voice-captioning.py [0:0]
def main():
    """Run the interactive camera captioning loop.

    Reads the camera index and the service-account key path from the
    parsed configuration, shows the live camera feed in a window and
    reacts to key presses in that window:

    * <space>: send the current frame to ``query_imagen_caption``, pass
      the returned captions to ``tts_captions`` and play the resulting
      MP3 audio.
    * ``q``: leave the loop.

    The camera and the display window are released on every exit path,
    including errors raised while captioning or playing audio.

    Raises:
        SystemExit: with status 1 when the camera cannot be opened, and
            with status 0 after the loop ends.
    """
    config = parse_config_args(CONFIG_FILE)
    input_dev = int(config["parameters"]["input"])

    # Set the environment variable GOOGLE_APPLICATION_CREDENTIALS to
    # the path of your Google Cloud service account key file.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = config["parameters"][
        "credentials"
    ]

    # Instantiates a TTS client
    tts_client = texttospeech.TextToSpeechClient()

    # Open the camera feed
    print("Opening camera: {}".format(input_dev))
    cap = cv2.VideoCapture(input_dev)
    try:
        # Request a 640x480 capture size.
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

        if not cap.isOpened():
            print("Cannot open camera {}".format(input_dev))
            # SystemExit is raised directly: the ``exit`` helper is
            # injected by the ``site`` module for interactive use only.
            raise SystemExit(1)

        print("Select the camera view window by clicking it")
        print("Press <space> to caption the camera view. Press q to quit")

        # Buffer handed to convert_image and replaced by its return value.
        byte_io = io.BytesIO()

        while cap.isOpened():
            ret, frame = cap.read()

            # ret is True only when a frame was read; a failed read is
            # skipped and retried (no key is polled on this path).
            if not ret:
                continue

            cv2.imshow("Imagen Voice Captioning", frame)
            pressed = cv2.waitKey(1)

            if pressed == ord("q"):
                break
            if pressed != ord(" "):
                continue

            # Query imagen
            # NOTE(review): presumably convert_image leaves the buffer
            # positioned so that read() yields the encoded image - confirm.
            byte_io = convert_image(frame, byte_io)
            captions = query_imagen_caption(byte_io.read())
            if not captions:
                continue

            # Query TTS
            audio = tts_captions(captions, tts_client)
            if audio:
                audiosegment = AudioSegment.from_file(
                    io.BytesIO(audio), format="mp3"
                )
                play(audiosegment)
    finally:
        # Always free the capture device and close the preview window,
        # even when captioning, TTS or playback raised.
        cap.release()
        cv2.destroyAllWindows()

    raise SystemExit(0)