Project-BasicAlgorithm/core/data.py (39 lines of code) (raw):
# Licensed to Apache Software Foundation (ASF) under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Apache Software Foundation (ASF) licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
import pandas as pd
from sklearn.model_selection import train_test_split
PATH_ERROR_MESSAGE = (
"data_path only support csv data or directory contained train.csv and test.csv"
)
def load_data(data_path, label_column, test_size=0.25, random_state=1):
if os.path.isdir(data_path):
train_path = os.path.join(data_path, "train.csv")
test_path = os.path.join(data_path, "test.csv")
assert os.path.exists(train_path) and os.path.exists(
test_path
), PATH_ERROR_MESSAGE
print(f"load train data from {train_path}")
print(f"load test data from {test_path}")
train_x, train_y = load_csv_data(train_path, label_column)
test_x, test_y = load_csv_data(test_path, label_column)
elif data_path.endswith(".csv"):
print(f"load data from {data_path}")
print("split data to train set and test set")
train_x, train_y, test_x, test_y = load_split_csv_data(
data_path, label_column, test_size=test_size, random_state=random_state
)
else:
raise Exception(PATH_ERROR_MESSAGE)
return train_x, train_y, test_x, test_y
def load_split_csv_data(data_path, label_column, test_size=0.25, random_state=1):
data = pd.read_csv(data_path)
train, test = train_test_split(data, test_size=test_size, random_state=random_state)
train_x = train.drop([label_column], axis=1)
test_x = test.drop([label_column], axis=1)
train_y = train[[label_column]]
test_y = test[[label_column]]
return train_x, train_y, test_x, test_y
def load_csv_data(data_path, label_column):
data = pd.read_csv(data_path)
x = data.drop([label_column], axis=1)
y = data[[label_column]]
return x, y