in source/glue/jobs/forecast_etl.py [0:0]
def _detect_header(self):
"""
Detects a header in a CSV file in S3
:return: True or False (header present or not present)
"""
if not self.source.endswith(".csv"):
source = self._get_export_path()
else:
source = self.source
bucket, key = self._split_s3_url(source)
obj = self.s3_cli.get_object(Bucket=bucket, Key=key)
# read up to 100 lines of the .csv file
# take only non-empty lines as a sample
sample = ""
for index, line in zip(range(100), obj["Body"].iter_lines()):
if line:
sample += line.decode() + "\n"
# try manual detection (best for small datasets) - all header fields present
first_line = sample.splitlines()[0]
fields = first_line.split(",")
if sorted(list(fields)) == sorted(list(self.schema.fields)):
logger.info("%s header present" % (self.source))
return True
# try manual detection (best for small datasets) - no header fields present
if not any(field in list(self.schema.fields) for field in fields):
logger.info("%s header absent" % (self.source))
return False
# try auto detection if manual detection didn't work - this works well for larger files
try:
has_header = csv.Sniffer().has_header(sample)
logger.info(
"%s header %s" % (self.source, "present" if has_header else "absent")
)
except csv.Error:
# often caused by not being able to determine the delimiter - we can assume there is no header - it will be
# filtered out by glue transforms and joins, so this is not a concern.
has_header = False
logger.warning(
"%s has input data quality issues please verify your data set quality"
% self.source
)
return has_header