smallpond/logical/node.py [571:622]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        parquet_row_group_size: int = None,
        parquet_dictionary_encoding=False,
        parquet_compression="ZSTD",
        parquet_compression_level=3,
        use_duckdb_reader=False,
        output_name: str = None,
        output_path: str = None,
        cpu_limit: int = 1,
        gpu_limit: float = 0,
        memory_limit: Optional[int] = None,
    ) -> None:
        """
        Construct an ArrowComputeNode. See :func:`Node.__init__` for comments on the other parameters.

        Parameters
        ----------
        process_func, optional
            User-defined process function with the same signature as `self.process(...)`.
            If the user-defined function takes extra parameters, bind them with `functools.partial(...)`.
            See `test_partial_process_func` in `test/test_execution.py` for usage examples.
        parquet_row_group_size, optional
            The number of rows stored in each row group of parquet file.
            A large row group size provides more opportunities to compress the data.
            A small row group size can make row filtering faster and achieve higher concurrency.
            See https://duckdb.org/docs/data/parquet/tips.html#selecting-a-row_group_size.
        parquet_dictionary_encoding, optional
            Specify whether dictionary encoding should be used in general or only for specific columns.
            See `use_dictionary` in https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html.
        parquet_compression, optional
            The compression codec used for output parquet files.
        parquet_compression_level, optional
            The compression level used for output parquet files.
        use_duckdb_reader, optional
            Use duckdb (instead of the pyarrow parquet module) to load parquet files as arrow tables.
        cpu_limit, optional
            The maximum number of CPUs that may be used by tasks generated from this node.
            This is a resource requirement specified by the user and used to guide
            task scheduling. smallpond does NOT enforce this limit.
        gpu_limit, optional
            The maximum number of GPUs that may be used by tasks generated from this node.
            This is a resource requirement specified by the user and used to guide
            task scheduling. smallpond does NOT enforce this limit.
        memory_limit, optional
            The maximum amount of memory that may be used by tasks generated from this node.
            This is a resource requirement specified by the user and used to guide
            task scheduling. smallpond does NOT enforce this limit.
        """
        super().__init__(
            ctx,
            input_deps,
            output_name,
            output_path,
            cpu_limit,
            gpu_limit,
            memory_limit,
        )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
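
The docstring above recommends binding extra arguments of a user-defined `process_func` with `functools.partial(...)`. Below is a minimal sketch of that pattern; the exact signature of `self.process(...)`, the `threshold` argument, and the `score` column are illustrative assumptions, not part of the smallpond API.

import functools

import pyarrow as pa
import pyarrow.compute as pc

# Hypothetical user function: the first parameters stand in for whatever
# `self.process(...)` receives (assumed here to be the runtime task and a list
# of input tables); `threshold` is an extra argument the node never passes.
def filter_rows(runtime_task, input_tables, threshold: float) -> pa.Table:
    table = pa.concat_tables(input_tables)
    return table.filter(pc.greater(table["score"], threshold))

# Bind the extra argument so the resulting callable matches the expected signature.
process_func = functools.partial(filter_rows, threshold=0.5)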



smallpond/logical/node.py [726:783]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        parquet_row_group_size: int = None,
        parquet_dictionary_encoding=False,
        parquet_compression="ZSTD",
        parquet_compression_level=3,
        use_duckdb_reader=False,
        output_name: str = None,
        output_path: str = None,
        cpu_limit: int = 1,
        gpu_limit: float = 0,
        memory_limit: Optional[int] = None,
    ) -> None:
        """
        Construct an ArrowStreamNode. See :func:`Node.__init__` for comments on the other parameters.

        Parameters
        ----------
        process_func, optional
            User-defined process function with the same signature as `self.process(...)`.
            If the user-defined function takes extra parameters, bind them with `functools.partial(...)`.
            See `test_partial_process_func` in `test/test_execution.py` for usage examples.
        background_io_thread, optional
            Create a background IO thread for read/write.
        streaming_batch_size, optional
            Split the input datasets into batches, each with length less than or equal to `streaming_batch_size`.
        secs_checkpoint_interval, optional
            Create a checkpoint of the stream task every `secs_checkpoint_interval` seconds.
        parquet_row_group_size, optional
            The number of rows stored in each row group of parquet file.
            A large row group size provides more opportunities to compress the data.
            A small row group size can make row filtering faster and achieve higher concurrency.
            See https://duckdb.org/docs/data/parquet/tips.html#selecting-a-row_group_size.
        parquet_dictionary_encoding, optional
            Specify whether dictionary encoding should be used in general or only for specific columns.
            See `use_dictionary` in https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html.
        parquet_compression, optional
            The compression codec used for output parquet files.
        parquet_compression_level, optional
            The compression level used for output parquet files.
        use_duckdb_reader, optional
            Use duckdb (instead of the pyarrow parquet module) to load parquet files as arrow tables.
        cpu_limit, optional
            The maximum number of CPUs that may be used by tasks generated from this node.
            This is a resource requirement specified by the user and used to guide
            task scheduling. smallpond does NOT enforce this limit.
        gpu_limit, optional
            The maximum number of GPUs that may be used by tasks generated from this node.
            This is a resource requirement specified by the user and used to guide
            task scheduling. smallpond does NOT enforce this limit.
        memory_limit, optional
            The maximum amount of memory that may be used by tasks generated from this node.
            This is a resource requirement specified by the user and used to guide
            task scheduling. smallpond does NOT enforce this limit.
        """
        super().__init__(
            ctx,
            input_deps,
            output_name,
            output_path,
            cpu_limit,
            gpu_limit,
            memory_limit,
        )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
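
`parquet_dictionary_encoding` in both nodes mirrors the `use_dictionary` option of `pyarrow.parquet.ParquetWriter` linked above, which accepts either a boolean or a list of column names. The sketch below shows that pyarrow option in isolation, together with row group size and ZSTD compression settings analogous to `parquet_row_group_size`, `parquet_compression`, and `parquet_compression_level`; whether smallpond forwards these values to pyarrow verbatim is an assumption, and the concrete values are illustrative.

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"city": ["SF", "SF", "NYC"], "value": [1.0, 2.0, 3.0]})

# Dictionary-encode only the low-cardinality column; the row group size and
# compression settings are illustrative values, not smallpond defaults.
pq.write_table(
    table,
    "example.parquet",
    row_group_size=100_000,
    use_dictionary=["city"],
    compression="zstd",
    compression_level=3,
)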



