packages/core/src/dataset-pool/format/csv.ts (55 lines of code) (raw):
import * as DataCook from '@pipcook/datacook';
import * as Papaparse from 'papaparse';
import { ArrayDatasetPoolImpl, Types } from '..';
import Csv = DataCook.Dataset.Types.Csv;
/**
 * Options accepted by `makeDatasetPoolFromCsv`.
 *
 * Each `*Data` field is a string handed directly to `Papaparse.parse`;
 * a field that is absent or empty is skipped and yields no samples
 * for that split.
 */
export interface Options {
// CSV content for the training split.
trainData?: string;
// CSV content for the test split.
testData?: string;
// CSV content for the validation split.
validData?: string;
// CSV content for the prediction split.
predictedData?: string;
// Forwarded to Papaparse as its `header` option.
hasHeader: boolean;
// Forwarded to Papaparse as its `delimiter` option; may be left undefined.
delimiter?: string;
// Field names that are moved out of each row's `data` and into its `label`.
labels?: string[];
}
/**
 * Turns parsed CSV rows into samples with features and labels separated.
 *
 * Every row is shallow-copied, so the parsed input is never mutated. For each
 * name in `labelFields` the value is moved from the copy into the sample's
 * `label` object; whatever remains on the copy becomes the sample's `data`.
 *
 * @param parsedData - Parse result whose `data` array holds one record per row.
 * @param labelFields - Names of the fields to treat as labels. When omitted,
 *   every sample gets an empty `label` and an untouched copy of the row.
 * @returns One sample per parsed row, in the original row order.
 */
function toSamples(
  parsedData: Papaparse.ParseResult<Record<string, string>>,
  labelFields?: Array<string>
): Array<Csv.Sample> {
  // Normalise once so the per-row loop does not need an optional call.
  const fields = labelFields ?? [];

  return parsedData.data.map((row) => {
    const features = { ...row };
    const label: Record<string, string> = {};

    for (const field of fields) {
      // Read before deleting: the value must be captured from the copy first.
      label[field] = features[field];
      delete features[field];
    }

    return { data: features, label };
  });
}
/**
 * Builds an in-memory dataset pool from up to four CSV strings.
 *
 * Each provided split (train / test / valid / predicted) is parsed with
 * Papaparse using the same `header` and `delimiter` settings, converted to
 * samples via `toSamples` with `options.labels`, and handed to
 * `ArrayDatasetPoolImpl.from` together with a table-type meta object that
 * records the number of samples per split. A split whose source string is
 * absent or empty is left `undefined` and counted as size 0.
 *
 * @param options - CSV sources and parsing settings.
 * @returns The dataset pool produced by `ArrayDatasetPoolImpl.from`.
 */
export const makeDatasetPoolFromCsv = (options: Options): Types.DatasetPool<Csv.Sample, Types.Csv.DatasetMeta> => {
  const parseConfig = {
    header: options.hasHeader, delimiter: options.delimiter
  };

  // Parse one split and convert it to samples; falsy input means "no such split".
  const loadSplit = (raw?: string): Array<Csv.Sample> | undefined => {
    if (!raw) {
      return undefined;
    }
    const parsed = Papaparse.parse<Record<string, string>>(raw, parseConfig);
    return toSamples(parsed, options.labels);
  };

  const trainData = loadSplit(options.trainData);
  const testData = loadSplit(options.testData);
  const validData = loadSplit(options.validData);
  const predictedData = loadSplit(options.predictedData);

  const meta: Types.Csv.DatasetMeta = {
    type: DataCook.Dataset.Types.DatasetType.Table,
    size: {
      train: trainData?.length ?? 0,
      test: testData?.length ?? 0,
      valid: validData?.length ?? 0,
      predicted: predictedData?.length ?? 0
    }
  };

  return ArrayDatasetPoolImpl.from({ trainData, testData, validData, predictedData }, meta);
};