// tzrec/protos/data.proto

syntax = "proto2";

package tzrec.protos;

import "tzrec/protos/sampler.proto";

// Backend used to read the input data.
enum DatasetType {
  OdpsDataset = 1;
  ParquetDataset = 2;
  CsvDataset = 3;
  OdpsDatasetV1 = 4;
}

// Value dtype of an input column.
// NOTE(review): value 3 is skipped — presumably a removed value; do not
// reuse the number without confirming the schema history.
// NOTE(review): values are not prefixed with the enum name; renaming them
// now would break generated code, so they are left as-is.
enum FieldType {
  INT32 = 0;
  INT64 = 1;
  STRING = 2;
  FLOAT = 4;
  DOUBLE = 5;
}

// How feature generation (fg) is executed on the input data.
enum FgMode {
  // input data is feature generate encoded,
  // we do not do fg
  FG_NONE = 1;
  // input data is raw feature,
  // we use python to run feature generate
  FG_NORMAL = 2;
  // input data is raw feature,
  // we use fg_handler to run feature generate
  FG_DAG = 3;
  // input data is after feature generate but before do bucketize,
  // we do bucketize only
  FG_BUCKETIZE = 4;
}

// Description of a single input column.
message Field {
  // name of the input column.
  required string input_name = 1;

  // only need specify it when use CsvDataset and
  // value dtype can not be inferred (all values in the column are null)
  optional FieldType input_type = 2;
}

// Configuration for reading, batching and sampling training/eval data.
message DataConfig {
  // mini batch size to use for training and evaluation.
  optional uint32 batch_size = 1 [default = 1024];

  // dataset type.
  required DatasetType dataset_type = 2 [default = OdpsDataset];

  // [deprecated] please use fg_mode.
  // input data is feature generate encoded or not.
  // if fg_encoded = true, you should do fg offline first,
  // and set fg_encoded_multival_sep for split multi-val feature
  optional bool fg_encoded = 3 [default = true];

  // separator for multi-val feature in fg encoded input data
  optional string fg_encoded_multival_sep = 4 [default = '\x03'];

  // labels
  repeated string label_fields = 5;

  // number of workers for parallel processing raw data
  optional uint32 num_workers = 6 [default = 8];

  // pin memory for fast cudaMemCopy
  optional bool pin_memory = 7 [default = true];

  // the input fields must be the same number and in the
  // same order as data in csv files
  repeated Field input_fields = 8;

  // delimiter of column features, only used for CsvDataset
  optional string delimiter = 9 [default = ','];

  // for csv files, with header or not.
  optional bool with_header = 10 [default = false];

  // mini batch size to use for evaluation.
  // if unset, batch_size is presumably used — TODO confirm in the reader.
  optional uint32 eval_batch_size = 11;

  // drop last batch less than batch_size
  optional bool drop_remainder = 12 [default = false];

  // fg threads for each worker,
  // if fg_threads = 0, will disable fg dag handler, use python run.
  optional uint32 fg_threads = 13 [default = 1];

  // when use OdpsDataset, read data orderby table partitions or not.
  optional bool is_orderby_partition = 14 [default = false];

  // maxcompute storage api & tunnel quota name
  optional string odps_data_quota_name = 15 [default = "pay-as-you-go"];

  // mask probability for samples in training progress
  optional float sample_mask_prob = 16 [default = 0.0];

  // mask probability for sampled negatives in training progress
  optional float negative_sample_mask_prob = 17 [default = 0.0];

  // force padding data into same data group with same batch_size
  optional bool force_base_data_group = 18 [default = false];

  // sample weights
  repeated string sample_weight_fields = 19;

  // fg run mode.
  optional FgMode fg_mode = 20 [default = FG_NONE];

  // hstu enable
  optional bool enable_hstu = 21 [default = false];

  // whether to shuffle data
  optional bool shuffle = 22 [default = false];

  // shuffle buffer for better performance, even shuffle buffer is set,
  // it is suggested to do full data shuffle before training
  // especially when the performance of models is not good.
  optional uint32 shuffle_buffer_size = 23 [default = 32];

  // maxcompute storage api data compression type, LZ4_FRAME | ZSTD | UNCOMPRESSED
  optional string odps_data_compression = 24 [default = "LZ4_FRAME"];

  // negative sampler; at most one may be set.
  // sampler messages are defined in tzrec/protos/sampler.proto.
  oneof sampler {
    NegativeSampler negative_sampler = 101;
    NegativeSamplerV2 negative_sampler_v2 = 102;
    HardNegativeSampler hard_negative_sampler = 103;
    HardNegativeSamplerV2 hard_negative_sampler_v2 = 104;
    TDMSampler tdm_sampler = 105;
  }
}