// src/dataframe.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::physical_plan::PyExecutionPlan;
use crate::sql::logical::PyLogicalPlan;
use crate::utils::wait_for_future;
use crate::{errors::DataFusionError, expr::PyExpr};
use datafusion::arrow::datatypes::Schema;
use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow};
use datafusion::arrow::util::pretty;
use datafusion::dataframe::DataFrame;
use datafusion::prelude::*;
use pyo3::exceptions::PyTypeError;
use pyo3::prelude::*;
use pyo3::types::PyTuple;
use std::sync::Arc;

/// A PyDataFrame is a representation of a logical plan and an API to compose statements.
/// Use it to build a plan and `.collect()` to execute the plan and collect the result.
/// The actual execution of a plan runs natively in Rust and Arrow, in a multi-threaded
/// environment.
#[pyclass(name = "DataFrame", module = "datafusion", subclass)]
#[derive(Clone)]
pub(crate) struct PyDataFrame {
    df: Arc<DataFrame>,
}

impl PyDataFrame {
    /// Creates a new PyDataFrame.
    pub fn new(df: DataFrame) -> Self {
        Self { df: Arc::new(df) }
    }
}

#[pymethods]
impl PyDataFrame {
    /// Enable selection via `df[col]`, `df[col1, col2]`, and `df[[col1, col2]]`.
    fn __getitem__(&self, key: PyObject) -> PyResult<Self> {
        Python::with_gil(|py| {
            if let Ok(key) = key.extract::<&str>(py) {
                // df[col]: a single column name
                self.select_columns(vec![key])
            } else if let Ok(tuple) = key.extract::<&PyTuple>(py) {
                // df[col1, col2]: a tuple of column names
                let keys = tuple
                    .iter()
                    .map(|item| item.extract::<&str>())
                    .collect::<PyResult<Vec<&str>>>()?;
                self.select_columns(keys)
            } else if let Ok(keys) = key.extract::<Vec<&str>>(py) {
                // df[[col1, col2]]: a list of column names
                self.select_columns(keys)
            } else {
                let message = "DataFrame can only be indexed by string index or indices";
                Err(PyTypeError::new_err(message))
            }
        })
    }

    /// Render up to the first 10 rows when the DataFrame is echoed in a REPL.
    fn __repr__(&self, py: Python) -> PyResult<String> {
        let df = self.df.as_ref().clone().limit(0, Some(10))?;
        let batches = wait_for_future(py, df.collect())?;
        let batches_as_string = pretty::pretty_format_batches(&batches);
        match batches_as_string {
            Ok(batch) => Ok(format!("DataFrame()\n{batch}")),
            Err(err) => Ok(format!("Error: {:?}", err.to_string())),
        }
    }

    /// Calculate summary statistics for a DataFrame.
    fn describe(&self, py: Python) -> PyResult<Self> {
        let df = self.df.as_ref().clone();
        let stat_df = wait_for_future(py, df.describe())?;
        Ok(Self::new(stat_df))
    }

    /// Returns the schema from the logical plan.
    fn schema(&self) -> PyArrowType<Schema> {
        PyArrowType(self.df.schema().into())
    }

    #[pyo3(signature = (*args))]
    fn select_columns(&self, args: Vec<&str>) -> PyResult<Self> {
        let df = self.df.as_ref().clone().select_columns(&args)?;
        Ok(Self::new(df))
    }

    #[pyo3(signature = (*args))]
    fn select(&self, args: Vec<PyExpr>) -> PyResult<Self> {
        let expr = args.into_iter().map(|e| e.into()).collect();
        let df = self.df.as_ref().clone().select(expr)?;
        Ok(Self::new(df))
    }
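    // Illustrative Python-side usage of the selection bindings above. This is a
    // sketch, not part of the module: the `ctx` SessionContext, table name, and
    // column names are hypothetical.
    //
    //     df = ctx.table("t")
    //     df["a"]                        # __getitem__ with a single column name
    //     df["a", "b"]                   # __getitem__ with a tuple of names
    //     df[["a", "b"]]                 # __getitem__ with a list of names
    //     df.select_columns("a", "b")    # varargs via #[pyo3(signature = (*args))]
    //     df.select(col("a"), col("b"))  # expression-based projection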
    fn filter(&self, predicate: PyExpr) -> PyResult<Self> {
        let df = self.df.as_ref().clone().filter(predicate.into())?;
        Ok(Self::new(df))
    }

    fn with_column(&self, name: &str, expr: PyExpr) -> PyResult<Self> {
        let df = self.df.as_ref().clone().with_column(name, expr.into())?;
        Ok(Self::new(df))
    }

    /// Rename one column by applying a new projection. This is a no-op if the column to be
    /// renamed does not exist.
    fn with_column_renamed(&self, old_name: &str, new_name: &str) -> PyResult<Self> {
        let df = self
            .df
            .as_ref()
            .clone()
            .with_column_renamed(old_name, new_name)?;
        Ok(Self::new(df))
    }

    fn aggregate(&self, group_by: Vec<PyExpr>, aggs: Vec<PyExpr>) -> PyResult<Self> {
        let group_by = group_by.into_iter().map(|e| e.into()).collect();
        let aggs = aggs.into_iter().map(|e| e.into()).collect();
        let df = self.df.as_ref().clone().aggregate(group_by, aggs)?;
        Ok(Self::new(df))
    }

    #[pyo3(signature = (*exprs))]
    fn sort(&self, exprs: Vec<PyExpr>) -> PyResult<Self> {
        let exprs = exprs.into_iter().map(|e| e.into()).collect();
        let df = self.df.as_ref().clone().sort(exprs)?;
        Ok(Self::new(df))
    }

    #[pyo3(signature = (count, offset=0))]
    fn limit(&self, count: usize, offset: usize) -> PyResult<Self> {
        let df = self.df.as_ref().clone().limit(offset, Some(count))?;
        Ok(Self::new(df))
    }

    /// Executes the plan, returning a list of `RecordBatch`es.
    /// Unless some order is specified in the plan, there is no
    /// guarantee of the order of the result.
    fn collect(&self, py: Python) -> PyResult<Vec<PyObject>> {
        let batches = wait_for_future(py, self.df.as_ref().clone().collect())?;
        // cannot use PyResult<Vec<RecordBatch>> return type due to
        // https://github.com/PyO3/pyo3/issues/1813
        batches.into_iter().map(|rb| rb.to_pyarrow(py)).collect()
    }

    /// Cache DataFrame.
    fn cache(&self, py: Python) -> PyResult<Self> {
        let df = wait_for_future(py, self.df.as_ref().clone().cache())?;
        Ok(Self::new(df))
    }
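    // A usage sketch for the row-limiting and collection bindings above (the
    // DataFrame `df` is hypothetical). Note the argument order: the Python-facing
    // `limit(count, offset=0)` maps to DataFusion's `DataFrame::limit(offset,
    // Some(count))`, which takes the offset first.
    //
    //     page = df.limit(10, offset=20).collect()   # rows 20..30 as pyarrow batches
    //     cached = df.cache()                        # materialize once, reuse later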
    /// Executes this DataFrame and collects all results into a vector of vectors of
    /// `RecordBatch`es, maintaining the input partitioning.
    fn collect_partitioned(&self, py: Python) -> PyResult<Vec<Vec<PyObject>>> {
        let batches = wait_for_future(py, self.df.as_ref().clone().collect_partitioned())?;
        batches
            .into_iter()
            .map(|rbs| rbs.into_iter().map(|rb| rb.to_pyarrow(py)).collect())
            .collect()
    }

    /// Print the result, 20 rows by default.
    #[pyo3(signature = (num=20))]
    fn show(&self, py: Python, num: usize) -> PyResult<()> {
        let df = self.df.as_ref().clone().limit(0, Some(num))?;
        print_dataframe(py, df)
    }

    /// Filter out duplicate rows.
    fn distinct(&self) -> PyResult<Self> {
        let df = self.df.as_ref().clone().distinct()?;
        Ok(Self::new(df))
    }

    fn join(
        &self,
        right: PyDataFrame,
        join_keys: (Vec<&str>, Vec<&str>),
        how: &str,
    ) -> PyResult<Self> {
        let join_type = match how {
            "inner" => JoinType::Inner,
            "left" => JoinType::Left,
            "right" => JoinType::Right,
            "full" => JoinType::Full,
            "semi" => JoinType::LeftSemi,
            "anti" => JoinType::LeftAnti,
            how => {
                return Err(DataFusionError::Common(format!(
                    "The join type {how} does not exist or is not implemented"
                ))
                .into());
            }
        };

        let df = self.df.as_ref().clone().join(
            right.df.as_ref().clone(),
            join_type,
            &join_keys.0,
            &join_keys.1,
            None,
        )?;
        Ok(Self::new(df))
    }

    /// Print the query plan.
    #[pyo3(signature = (verbose=false, analyze=false))]
    fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyResult<()> {
        let df = self.df.as_ref().clone().explain(verbose, analyze)?;
        print_dataframe(py, df)
    }

    /// Get the logical plan for this `DataFrame`.
    fn logical_plan(&self) -> PyResult<PyLogicalPlan> {
        Ok(self.df.as_ref().clone().logical_plan().clone().into())
    }

    /// Get the optimized logical plan for this `DataFrame`.
    fn optimized_logical_plan(&self) -> PyResult<PyLogicalPlan> {
        Ok(self.df.as_ref().clone().into_optimized_plan()?.into())
    }

    /// Get the execution plan for this `DataFrame`.
    fn execution_plan(&self, py: Python) -> PyResult<PyExecutionPlan> {
        let plan = wait_for_future(py, self.df.as_ref().clone().create_physical_plan())?;
        Ok(plan.into())
    }

    /// Repartition a `DataFrame` into `num` partitions using a round-robin scheme.
    fn repartition(&self, num: usize) -> PyResult<Self> {
        let new_df = self
            .df
            .as_ref()
            .clone()
            .repartition(Partitioning::RoundRobinBatch(num))?;
        Ok(Self::new(new_df))
    }

    /// Repartition a `DataFrame` into `num` partitions by hashing the given expressions.
    #[pyo3(signature = (*args, num))]
    fn repartition_by_hash(&self, args: Vec<PyExpr>, num: usize) -> PyResult<Self> {
        let expr = args.into_iter().map(|py_expr| py_expr.into()).collect();
        let new_df = self
            .df
            .as_ref()
            .clone()
            .repartition(Partitioning::Hash(expr, num))?;
        Ok(Self::new(new_df))
    }

    /// Calculate the union of two `DataFrame`s, preserving duplicate rows unless
    /// `distinct` is set. The two `DataFrame`s must have exactly the same schema.
    #[pyo3(signature = (py_df, distinct=false))]
    fn union(&self, py_df: PyDataFrame, distinct: bool) -> PyResult<Self> {
        let new_df = if distinct {
            self.df
                .as_ref()
                .clone()
                .union_distinct(py_df.df.as_ref().clone())?
        } else {
            self.df.as_ref().clone().union(py_df.df.as_ref().clone())?
        };
        Ok(Self::new(new_df))
    }

    /// Calculate the distinct union of two `DataFrame`s. The
    /// two `DataFrame`s must have exactly the same schema.
    fn union_distinct(&self, py_df: PyDataFrame) -> PyResult<Self> {
        let new_df = self
            .df
            .as_ref()
            .clone()
            .union_distinct(py_df.df.as_ref().clone())?;
        Ok(Self::new(new_df))
    }
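    // A join/set-operation usage sketch (both DataFrames are hypothetical). The
    // `how` string must be one of the arms matched in `join` above; anything else
    // raises a DataFusionError.
    //
    //     joined = left.join(right, join_keys=(["id"], ["id"]), how="inner")
    //     everything = a.union(b)                    # keeps duplicate rows
    //     deduplicated = a.union(b, distinct=True)   # same as a.union_distinct(b)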
    /// Calculate the intersection of two `DataFrame`s. The two `DataFrame`s must have
    /// exactly the same schema.
    fn intersect(&self, py_df: PyDataFrame) -> PyResult<Self> {
        let new_df = self
            .df
            .as_ref()
            .clone()
            .intersect(py_df.df.as_ref().clone())?;
        Ok(Self::new(new_df))
    }

    /// Calculate the set difference of two `DataFrame`s. The two `DataFrame`s must have
    /// exactly the same schema.
    fn except_all(&self, py_df: PyDataFrame) -> PyResult<Self> {
        let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?;
        Ok(Self::new(new_df))
    }

    /// Write a `DataFrame` to a CSV file.
    fn write_csv(&self, path: &str, py: Python) -> PyResult<()> {
        wait_for_future(py, self.df.as_ref().clone().write_csv(path))?;
        Ok(())
    }

    /// Write a `DataFrame` to a Parquet file.
    fn write_parquet(&self, path: &str, py: Python) -> PyResult<()> {
        wait_for_future(py, self.df.as_ref().clone().write_parquet(path, None))?;
        Ok(())
    }

    /// Executes a query and writes the results to a partitioned JSON file.
    fn write_json(&self, path: &str, py: Python) -> PyResult<()> {
        wait_for_future(py, self.df.as_ref().clone().write_json(path))?;
        Ok(())
    }

    /// Convert to an Arrow Table.
    /// Collect the batches and pass them to an Arrow Table.
    fn to_arrow_table(&self, py: Python) -> PyResult<PyObject> {
        let batches = self.collect(py)?.to_object(py);
        let schema: PyObject = self.schema().into_py(py);
        Python::with_gil(|py| {
            // Instantiate a pyarrow Table object and use its from_batches method
            let table_class = py.import("pyarrow")?.getattr("Table")?;
            let args = PyTuple::new(py, &[batches, schema]);
            let table: PyObject = table_class.call_method1("from_batches", args)?.into();
            Ok(table)
        })
    }

    /// Convert to a pandas DataFrame with pyarrow.
    /// Collect the batches, pass them to an Arrow Table, then convert to a pandas DataFrame.
    fn to_pandas(&self, py: Python) -> PyResult<PyObject> {
        let table = self.to_arrow_table(py)?;
        Python::with_gil(|py| {
            // See also: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas
            let result = table.call_method0(py, "to_pandas")?;
            Ok(result)
        })
    }

    /// Convert to a Python list using pyarrow.
    /// Each list item represents one row encoded as a dictionary.
    fn to_pylist(&self, py: Python) -> PyResult<PyObject> {
        let table = self.to_arrow_table(py)?;
        Python::with_gil(|py| {
            // See also: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pylist
            let result = table.call_method0(py, "to_pylist")?;
            Ok(result)
        })
    }

    /// Convert to a Python dictionary using pyarrow.
    /// Each dictionary key is a column and the dictionary value represents the column values.
    fn to_pydict(&self, py: Python) -> PyResult<PyObject> {
        let table = self.to_arrow_table(py)?;
        Python::with_gil(|py| {
            // See also: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pydict
            let result = table.call_method0(py, "to_pydict")?;
            Ok(result)
        })
    }

    /// Convert to a polars DataFrame with pyarrow.
    /// Collect the batches, pass them to an Arrow Table, then convert to a polars DataFrame.
    fn to_polars(&self, py: Python) -> PyResult<PyObject> {
        let table = self.to_arrow_table(py)?;
        Python::with_gil(|py| {
            let dataframe = py.import("polars")?.getattr("DataFrame")?;
            let args = PyTuple::new(py, &[table]);
            let result: PyObject = dataframe.call1(args)?.into();
            Ok(result)
        })
    }

    /// Executes this DataFrame to get the total number of rows.
    fn count(&self, py: Python) -> PyResult<usize> {
        Ok(wait_for_future(py, self.df.as_ref().clone().count())?)
    }
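    // A conversion usage sketch (hypothetical `df`; pyarrow is required, and
    // pandas/polars must be installed for the respective conversions, since the
    // methods above import those modules at call time):
    //
    //     table = df.to_arrow_table()   # pyarrow.Table
    //     pdf = df.to_pandas()          # pandas.DataFrame
    //     rows = df.to_pylist()         # list of row dicts
    //     n = df.count()                # total number of rows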
}

/// Print a DataFrame via Python's built-in `print`.
fn print_dataframe(py: Python, df: DataFrame) -> PyResult<()> {
    // Get string representation of record batches
    let batches = wait_for_future(py, df.collect())?;
    let batches_as_string = pretty::pretty_format_batches(&batches);
    let result = match batches_as_string {
        Ok(batch) => format!("DataFrame()\n{batch}"),
        Err(err) => format!("Error: {:?}", err.to_string()),
    };

    // Import the Python 'builtins' module to access the print function.
    // Note that println! does not print to Python's stdout and is therefore
    // not visible in notebooks, for instance.
    let print = py.import("builtins")?.getattr("print")?;
    print.call1((result,))?;
    Ok(())
}
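// An end-to-end usage sketch from Python, tying the bindings above together.
// This is illustrative only: the file name, column names, and `SessionContext`
// setup are assumptions, not part of this module.
//
//     from datafusion import SessionContext, col
//
//     ctx = SessionContext()
//     df = ctx.read_parquet("example.parquet")
//     df = df.filter(col("amount") > col("cost")).limit(100)
//     df.show()   # prints via Python's print(), so it shows up in notebooks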