in src/mlio-py/mlio/core/data_reader.cc [253:676]
void register_data_readers(py::module &m)
{
py::enum_<Last_example_handling>(
m,
"LastExampleHandling",
"Specifies how the last ``Example`` read from a dataset should to be "
"handled if the dataset size is not evenly divisible by the batch "
"size.")
.value("NONE",
Last_example_handling::none,
"Return an ``Example`` where the size of the batch dimension is "
"less than the requested batch size.")
.value("DROP", Last_example_handling::drop, "Drop the last ``Example``.")
.value("DROP_WARN", Last_example_handling::drop_warn, "Drop the last ``Example`` and warn.")
.value("PAD",
Last_example_handling::pad,
"Pad the feature tensors with zero so that the size of the batch "
"dimension equals the requested batch size.")
.value("PAD_WARN",
Last_example_handling::pad_warn,
"Pad the feature tensors with zero so that the size of the batch "
"dimension equals the requested batch size and warn.");
py::enum_<Bad_example_handling>(
m,
"BadExampleHandling",
"Specifies how an ``Example`` that contains erroneous data should be"
"handled.")
.value("ERROR", Bad_example_handling::error, "Raise an error.")
.value("SKIP", Bad_example_handling::skip, "Skip the ``Example``.")
.value("SKIP_WARN", Bad_example_handling::skip_warn, "Skip the ``Example`` and warn.")
.value("PAD",
Bad_example_handling::pad,
"Skip bad instances, pad the ``Example`` to the batch size.")
.value("PAD_WARN",
Bad_example_handling::pad_warn,
"Skip bad instances, pad the ``Example`` to the batch size, and warn.");
py::enum_<Max_field_length_handling>(
m,
"MaxFieldLengthHandling",
"Specifies how field and columns should be handled when breached.")
.value("TREAT_AS_BAD",
Max_field_length_handling::treat_as_bad,
"Treat the corresponding row as bad.")
.value("TRUNCATE", Max_field_length_handling::truncate, "Truncate the field.")
.value("TRUNCATE_WARN",
Max_field_length_handling::truncate_warn,
"Truncate the field and warn.");
py::enum_<Image_frame>(m, "ImageFrame", "Specifies the Image_frame parameter value")
.value("NONE", Image_frame::none, "none.")
.value("RECORDIO", Image_frame::recordio, "For recordio files.");
py::class_<Py_data_iterator>(m, "DataIterator")
.def("__iter__",
[](Py_data_iterator &it) -> Py_data_iterator & {
return it;
})
.def("__next__", &Py_data_iterator::next);
py::class_<Data_reader_params>(
m, "DataReaderParams", "Represents the common parameters of a ``Data_reader`` object.")
.def(py::init(&make_data_reader_params),
"dataset"_a,
"batch_size"_a,
"num_prefetched_examples"_a = 0,
"num_parallel_reads"_a = 0,
"last_example_handling"_a = Last_example_handling::none,
"bad_example_handling"_a = Bad_example_handling::error,
"warn_bad_instances"_a = false,
"num_instances_to_skip"_a = 0,
"num_instances_to_read"_a = std::nullopt,
"shard_index"_a = 0,
"num_shards"_a = 0,
"sample_ratio"_a = std::nullopt,
"shuffle_instances"_a = false,
"shuffle_window"_a = 0,
"shuffle_seed"_a = std::nullopt,
"reshuffle_each_epoch"_a = true,
R"(
Parameters
----------
dataset : list of DataStores
A list of ``DataStore`` instances that together form the
dataset to read from.
batch_size : int
A number indicating how many data instances should be packed
into a single ``Example``.
num_prefetched_examples : int, optional
The number of examples to prefetch in the background to accelerate
reading. If zero, defaults to the number of processor cores.
num_parallel_reads : int, optional
The number of parallel reads. If not specified, it defaults to
`num_prefetched_examples`. In case a large number of examples
should be prefetched, this parameter can be used to avoid
thread oversubscription.
last_example_handling : LastExampleHandling
See ``LastExampleHandling``.
bad_example_handling : BadExampleHandling
See ``BadExampleHandling``.
warn_bad_instances : bool, optional
A boolean value indicating whether a warning will be output for
each bad instance.
num_instances_to_skip : int, optional
The number of data instances to skip from the beginning of the
dataset.
num_instances_to_read : int, optional
The number of data instances to read. The rest of the dataset
will be ignored.
shard_index : int, optional
The index of the shard to read.
num_shards : int, optional
The number of shards the dataset should be split into. The
reader will only read 1/num_shards of the dataset.
sample_ratio : float, optional
A ratio between zero and one indicating how much of the dataset
should be read. The dataset will be sampled based on this
number.
shuffle_instances : bool
A boolean value indicating whether to shuffle the data instances
while reading from the dataset.
shuffle_window : int
The number of data instances to buffer and sample from. The
selected data instances will be replaced with new data instances
read from the dataset.
A value of zero means perfect shuffling and requires loading the
whole dataset into memory first.
shuffle_seed : int, optional
The seed that will be used for initializing the sampling
distribution. If not specified, a random seed will be generated
internally.
reshuffle_each_epoch : bool, optional
A boolean value indicating whether the dataset should be
reshuffled after every `DataReader.reset()` call.
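Examples
--------
A minimal construction sketch; ``mlio.File`` is assumed here as a
``DataStore`` implementation for local files:

>>> import mlio
>>> dataset = [mlio.File("train.csv")]
>>> prm = mlio.DataReaderParams(dataset=dataset, batch_size=32)
>>> prm.shuffle_instances = True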
)")
.def_readwrite("dataset", &Data_reader_params::dataset)
.def_readwrite("batch_size", &Data_reader_params::batch_size)
.def_readwrite("num_prefetched_examples", &Data_reader_params::num_prefetched_examples)
.def_readwrite("num_parallel_reads", &Data_reader_params::num_parallel_reads)
.def_readwrite("last_example_handling", &Data_reader_params::last_example_handling)
.def_readwrite("bad_example_handling", &Data_reader_params::bad_example_handling)
.def_readwrite("num_instances_to_skip", &Data_reader_params::num_instances_to_skip)
.def_readwrite("num_instances_to_read", &Data_reader_params::num_instances_to_read)
.def_readwrite("shard_index", &Data_reader_params::shard_index)
.def_readwrite("num_shards", &Data_reader_params::num_shards)
.def_readwrite("sample_ratio", &Data_reader_params::sample_ratio)
.def_readwrite("shuffle_instances", &Data_reader_params::shuffle_instances)
.def_readwrite("shuffle_window", &Data_reader_params::shuffle_window)
.def_readwrite("shuffle_seed", &Data_reader_params::shuffle_seed)
.def_readwrite("reshuffle_each_epoch", &Data_reader_params::reshuffle_each_epoch);
py::class_<Csv_params>(
m, "CsvParams", "Represents the optional parameters of a ``CsvReader`` object.")
.def(py::init(&make_csv_reader_params),
"column_names"_a = std::vector<std::string>{},
"name_prefix"_a = "",
"use_columns"_a = std::unordered_set<std::string>{},
"use_columns_by_index"_a = std::unordered_set<std::size_t>{},
"default_data_type"_a = std::nullopt,
"column_types"_a = std::unordered_map<std::string, Data_type>{},
"column_types_by_index"_a = std::unordered_map<std::size_t, Data_type>{},
"header_row_index"_a = 0,
"has_single_header"_a = false,
"dedupe_column_names"_a = true,
"delimiter"_a = ',',
"quote_char"_a = '"',
"comment_char"_a = std::nullopt,
"allow_quoted_new_lines"_a = false,
"skip_blank_lines"_a = true,
"encoding"_a = std::nullopt,
"max_field_length"_a = std::nullopt,
"max_field_length_handling"_a = Max_field_length_handling::treat_as_bad,
"max_line_length"_a = std::nullopt,
"parser_options"_a = std::nullopt,
R"(
Parameters
----------
column_names : list of strs
The column names.
If the dataset has a header and `header_row_index` is specified,
this list can be left empty to infer the column names from the
dataset.
Due to a shortcoming in pybind11, values cannot be added to
container types, and updates must instead be made via
assignment.
name_prefix : str
The prefix to prepend to column names.
use_columns : list of strs
The columns that should be read. The rest of the columns will
be skipped.
Due to a shortcoming in pybind11, values cannot be added to
container types, and updates must instead be made via
assignment.
use_columns_by_index : list of ints
The columns, specified by index, that should be read. The rest
of the columns will be skipped.
Due to a shortcoming in pybind11, values cannot be added to
container types, and updates must instead be made via
assignment.
default_data_type : DataType
The data type for columns for which no explicit data type is
specified via `column_types` or `column_types_by_index`. If not
specified, the column data types will be inferred from the
dataset.
column_types : map of str to DataType
The mapping between column names and data types.
Due to a shortcoming in pybind11, values cannot be added to
container types, and updates must instead be made via
assignment.
column_types_by_index : map of int to DataType
The mapping between column indices and data types.
Due to a shortcoming in pybind11, values cannot be added to
container types, and updates must instead be made via
assignment.
header_row_index : int, optional
The index of the row that should be treated as the header of the
dataset. If `column_names` is empty, the column names will be
inferred from that row. If neither `header_row_index` nor
`column_names` is specified, the column ordinal positions
will be used as column names.
Each data store in the dataset should have its header at the
same index.
has_single_header : bool, optional
A boolean value indicating whether the dataset has a header row
only in the first data store.
dedupe_column_names : bool, optional
A boolean value indicating whether duplicate columns should be
renamed. If true, duplicate columns 'X', ..., 'X' will be
renamed to 'X', 'X_1', 'X_2', ...
delimiter : char
The delimiter character.
quote_char : char
The character used for quoting field values.
comment_char : char, optional
The comment character. Lines that start with the comment
character will be skipped.
allow_quoted_new_lines : bool
A boolean value indicating whether quoted fields can be multi-
line. Note that turning this flag on can slow down the reading
speed.
skip_blank_lines : bool
A boolean value indicating whether to skip empty lines.
encoding : str, optional
The text encoding to use for reading. If not specified, it will
be inferred from the preamble of the text; if no preamble is
present, it falls back to UTF-8.
max_field_length : int, optional
The maximum number of characters that will be read in a field.
Any characters beyond this limit will be handled using the
strategy in `max_field_length_handling`.
max_field_length_handling : MaxFieldLengthHandling, optional
See ``MaxFieldLengthHandling``.
max_line_length : int, optional
The maximum size of a text line. If a row is longer than the
specified size, an error will be raised.
parser_options : ParserParams, optional
See ``ParserParams``.
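Examples
--------
A hedged sketch of customizing CSV reading; the column name and the
``mlio.DataType.FLOAT32`` value are illustrative assumptions:

>>> import mlio
>>> csv_prm = mlio.CsvParams(delimiter='|', skip_blank_lines=True)
>>> csv_prm.column_types = {"label": mlio.DataType.FLOAT32}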
)")
.def_readwrite("column_names", &Csv_params::column_names)
.def_readwrite("name_prefix", &Csv_params::name_prefix)
.def_readwrite("use_columns", &Csv_params::use_columns)
.def_readwrite("use_columns_by_index", &Csv_params::use_columns_by_index)
.def_readwrite("default_data_type", &Csv_params::default_data_type)
.def_readwrite("column_types", &Csv_params::column_types)
.def_readwrite("column_types_by_index", &Csv_params::column_types_by_index)
.def_readwrite("header_row_index", &Csv_params::header_row_index)
.def_readwrite("has_single_header", &Csv_params::has_single_header)
.def_readwrite("dedupe_column_names", &Csv_params::dedupe_column_names)
.def_readwrite("delimiter", &Csv_params::delimiter)
.def_readwrite("quote_char", &Csv_params::quote_char)
.def_readwrite("comment_char", &Csv_params::comment_char)
.def_readwrite("allow_quoted_new_lines", &Csv_params::allow_quoted_new_lines)
.def_readwrite("skip_blank_lines", &Csv_params::skip_blank_lines)
.def_readwrite("encoding", &Csv_params::encoding)
.def_readwrite("max_field_length", &Csv_params::max_field_length)
.def_readwrite("max_field_length_handling", &Csv_params::max_field_length_handling)
.def_readwrite("max_line_length", &Csv_params::max_line_length)
.def_readwrite("parser_options", &Csv_params::parser_options);
py::class_<Image_reader_params>(
m, "ImageReaderParams", "Represents the optional parameters of an ``ImageReader`` object.")
.def(py::init(&make_image_reader_params),
"image_frame"_a = Image_frame::none,
"resize"_a = std::nullopt,
"image_dimensions"_a = std::nullopt,
"to_rgb"_a = false,
R"(
Parameters
----------
image_frame : ImageFrame
The image frame type; ``NONE`` for raw image files or
``RECORDIO`` for RecordIO files.
resize : int, optional
Scales the shorter edge to a new size before applying other
augmentations.
image_dimensions : list of ints, optional
The dimensions of the output image in (channels, height, width)
format.
to_rgb : bool
A boolean value indicating whether to convert from BGR (the
OpenCV default) to RGB.
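Examples
--------
A minimal sketch producing 3-channel 224x224 RGB images; the sizes
are illustrative only:

>>> import mlio
>>> img_prm = mlio.ImageReaderParams(image_frame=mlio.ImageFrame.NONE,
...                                  resize=256,
...                                  image_dimensions=[3, 224, 224],
...                                  to_rgb=True)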
)")
.def_readwrite("image_frame", &Image_reader_params::image_frame)
.def_readwrite("resize", &Image_reader_params::resize)
.def_readwrite("image_dimensions", &Image_reader_params::image_dimensions)
.def_readwrite("to_rgb", &Image_reader_params::to_rgb);
py::class_<Parser_options>(m, "ParserParams")
.def(py::init(&make_parser_options),
"nan_values"_a = std::unordered_set<std::string>{},
"number_base"_a = 10,
R"(
Parameters
----------
nan_values : list of strs
For a floating-point parse operation, holds the list of strings
that should be treated as NaN.
Due to a shortcoming in pybind11, values cannot be added to
container types, and updates must instead be made via
assignment.
number_base : int
For a number parse operation, specifies the base of the number
in its string representation.
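Examples
--------
A short sketch; note that `nan_values` must be assigned as a whole
set rather than mutated in place:

>>> import mlio
>>> opts = mlio.ParserParams(number_base=16)
>>> opts.nan_values = {"nan", "?"}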
)")
.def_readwrite("nan_values", &Parser_options::nan_values)
.def_readwrite("number_base", &Parser_options::base);
py::class_<Data_reader, Py_data_reader, Intrusive_ptr<Data_reader>>(
m,
"DataReader",
"Represents an interface for classes that read examples from a "
"dataset in a particular data format.")
.def(py::init<>())
.def("read_example",
&Data_reader::read_example,
py::call_guard<py::gil_scoped_release>(),
"Returns the next ``Example`` read from the dataset. If the end "
"of the data is reached, returns None")
.def("peek_example",
&Data_reader::peek_example,
py::call_guard<py::gil_scoped_release>(),
"Returns the next ``Example`` read from the dataset without "
"consuming it.")
.def("read_schema",
&Data_reader::read_schema,
py::call_guard<py::gil_scoped_release>(),
"Returns the ``Schema`` of the dataset.")
.def("reset",
&Data_reader::reset,
"Resets the state of the reader. Calling ``read_example()`` the "
"next time will start reading from the beginning of the dataset.")
.def("__iter__",
[](py::object &reader) {
return Py_data_iterator(reader.cast<Data_reader &>(), reader);
})
.def_property_readonly("num_bytes_read",
&Data_reader::num_bytes_read,
R"(
Gets the number of bytes read from the dataset.
The returned number won't include the size of the discarded
parts of the dataset such as comment blocks.
The returned number can be greater than expected as MLIO
reads ahead in the background.)");
py::class_<Csv_reader, Data_reader, Intrusive_ptr<Csv_reader>>(
m, "CsvReader", "Represents a ``Data_reader`` for reading CSV datasets.")
.def(py::init<>(&make_csv_reader),
"data_reader_params"_a,
"csv_params"_a = std::nullopt,
R"(
Parameters
----------
data_reader_params : DataReaderParams
See ``DataReaderParams``.
csv_params : CsvParams, optional
See ``CsvParams``.
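Examples
--------
A hedged end-to-end sketch; ``mlio.File`` is assumed here as a
``DataStore`` implementation for local files:

>>> import mlio
>>> prm = mlio.DataReaderParams(dataset=[mlio.File("train.csv")],
...                             batch_size=64)
>>> reader = mlio.CsvReader(prm)
>>> for example in reader:
...     pass  # Each `example` packs up to 64 data instances.
>>> reader.reset()  # Start over from the beginning of the dataset.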
)");
py::class_<Image_reader, Data_reader, Intrusive_ptr<Image_reader>>(
m, "ImageReader", "Represents a ``Data_reader`` for reading Image datasets.")
.def(py::init<>(&make_image_reader),
"data_reader_params"_a,
"image_reader_params"_a = std::nullopt,
R"(
Parameters
----------
data_reader_params : DataReaderParams
See ``DataReaderParams``.
image_reader_params : ImageReaderParams, optional
See ``ImageReaderParams``.
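Examples
--------
A minimal sketch reading raw image files; ``mlio.File`` and the
output dimensions are illustrative assumptions:

>>> import mlio
>>> prm = mlio.DataReaderParams(dataset=[mlio.File("img.jpg")],
...                             batch_size=1)
>>> img_prm = mlio.ImageReaderParams(image_dimensions=[3, 224, 224])
>>> reader = mlio.ImageReader(prm, img_prm)
>>> example = reader.read_example()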
)");
py::class_<Recordio_protobuf_reader, Data_reader, Intrusive_ptr<Recordio_protobuf_reader>>(
m, "RecordIOProtobufReader")
.def(py::init<>(&make_recordio_protobuf_reader),
"data_reader_params"_a,
R"(
Parameters
----------
data_reader_params : DataReaderParams
See ``DataReaderParams``.
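Examples
--------
A minimal sketch; ``mlio.File`` is assumed here and the file is
expected to be RecordIO-protobuf encoded:

>>> import mlio
>>> prm = mlio.DataReaderParams(dataset=[mlio.File("train.pbr")],
...                             batch_size=128)
>>> reader = mlio.RecordIOProtobufReader(prm)
>>> example = reader.read_example()  # None once the data is exhausted.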
)");
py::class_<Text_line_reader, Data_reader, Intrusive_ptr<Text_line_reader>>(m, "TextLineReader")
.def(py::init<>(&make_text_line_reader),
"data_reader_params"_a,
R"(
Parameters
----------
data_reader_params : DataReaderParams
See ``DataReaderParams``.
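Examples
--------
A minimal sketch; ``mlio.File`` is assumed here as a ``DataStore``
implementation for local files:

>>> import mlio
>>> prm = mlio.DataReaderParams(dataset=[mlio.File("lines.txt")],
...                             batch_size=16)
>>> reader = mlio.TextLineReader(prm)
>>> schema = reader.read_schema()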
)");
}