def _detect_header()

in source/glue/jobs/forecast_etl.py [0:0]
31 lines of code
14 McCabe index (conditional complexity)

    def _detect_header(self):
        """
        Detects a header in a CSV file in S3

        Up to the first 100 lines of the object are read and the non-empty ones are kept as a sample.
        The decision is then made in this order:

        1. the first sampled line, split on ",", holds exactly the entries of ``self.schema.fields``
           (in any order) -> header present
        2. none of the first line's fields appear in ``self.schema.fields`` -> header absent
        3. otherwise ``csv.Sniffer().has_header`` decides; a ``csv.Error`` is treated as "no header"

        A sample with no non-empty lines (empty object, or only blank lines in the first 100) is
        reported as having no header instead of raising ``IndexError``.

        :return: True or False (header present or not present)
        """
        # anything that is not a .csv is resolved to its export path first
        if not self.source.endswith(".csv"):
            source = self._get_export_path()
        else:
            source = self.source

        bucket, key = self._split_s3_url(source)
        obj = self.s3_cli.get_object(Bucket=bucket, Key=key)

        # read up to 100 lines of the .csv file
        # take only non-empty lines as a sample
        body = obj["Body"]
        try:
            lines = [
                line.decode()
                for _, line in zip(range(100), body.iter_lines())
                if line
            ]
        finally:
            # only part of the object is read - close the stream explicitly so the underlying
            # connection is not held open until garbage collection
            body.close()

        if not lines:
            # nothing to inspect - there cannot be a header line
            logger.warning("%s has no data in its first 100 lines - header absent", self.source)
            return False

        sample = "".join(line + "\n" for line in lines)
        schema_fields = list(self.schema.fields)

        # try manual detection (best for small datasets) - all header fields present
        first_line = sample.splitlines()[0]
        fields = first_line.split(",")
        if sorted(fields) == sorted(schema_fields):
            logger.info("%s header present", self.source)
            return True

        # try manual detection (best for small datasets) - no header fields present
        if not any(field in schema_fields for field in fields):
            logger.info("%s header absent", self.source)
            return False

        # try auto detection if manual detection didn't work - this works well for larger files
        try:
            has_header = csv.Sniffer().has_header(sample)
        except csv.Error:
            # often caused by not being able to determine the delimiter - we can assume there is no header - it will be
            # filtered out by glue transforms and joins, so this is not a concern.
            has_header = False
            logger.warning(
                "%s has input data quality issues please verify your data set quality",
                self.source,
            )
        else:
            logger.info(
                "%s header %s", self.source, "present" if has_header else "absent"
            )

        return has_header