transcoder/source/file/LineDelimitedFileMessageSource.py (34 lines of code) (raw):

# # Copyright 2022 Google LLC # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import sys import base64 from transcoder.source.LineEncoding import LineEncoding from transcoder.source.file import FileMessageSource class LineDelimitedFileMessageSource(FileMessageSource): """Reads line delimited files and yields individual records for message consumption""" @staticmethod def source_type_identifier(): return 'line_delimited' def __init__(self, file_path: str, encoding: str, skip_lines: int = 0, message_skip_bytes: int = 0, line_encoding: LineEncoding = None): super().__init__(file_path, file_open_mode='rt', file_encoding=encoding) self.message_skip_bytes = message_skip_bytes self.line_encoding = line_encoding self.skip_lines = skip_lines def prepare(self): # file_size is only determinable on seekable input if sys.stdin.seekable() is True and self.file_size == 0: return if self.skip_lines > 0: for _ in range(self.skip_lines): self.file_handle.readline() def get_message_iterator(self): while line := self.file_handle.readline(): self.increment_count() yield self.decode_message(line) self._log_percentage_read() def decode_message(self, record): """Performs line decoding and message skip bytes for line encoded cases""" message = record if self.line_encoding == LineEncoding.BASE_64.value: message = base64.b64decode(record) elif self.line_encoding == LineEncoding.BASE_64_URL_SAFE.value: message = base64.urlsafe_b64decode(record) if self.message_skip_bytes > 0 and isinstance(message, bytes): # print(''.join('{:02x}'.format(x) for x in message)) message = message[self.message_skip_bytes:] # print(''.join('{:02x}'.format(x) for x in message)) return message