// easy_rec/python/protos/dataset.proto
syntax = "proto2";
package protos;
// Weighted random sampling of item ids that are not in the current batch
message NegativeSampler {
// sample data path
// schema: itemid weight attrs
required string input_path = 1;
// number of negative samples
required uint32 num_sample = 2;
// field names of attrs in train data or eval data
repeated string attr_fields = 3;
// field name of item_id in train data or eval data
required string item_id_field = 4;
optional string attr_delimiter = 5 [default=":"];
optional uint32 num_eval_sample = 6 [default=0];
// only works on DataScience/Local
optional string field_delimiter = 7 [default="\001"];
}
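//
// A minimal sketch of configuring the NegativeSampler above inside
// data_config (paths and field names are hypothetical):
//
//   negative_sampler {
//     input_path: "data/item_table.txt"
//     num_sample: 1024
//     attr_fields: "item_id"
//     attr_fields: "category"
//     item_id_field: "item_id"
//     attr_delimiter: ":"
//   }

// In-memory variant of NegativeSampler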
message NegativeSamplerInMemory {
// sample data path
// schema: itemid weight attrs
required string input_path = 1;
// number of negative samples
required uint32 num_sample = 2;
// field names of attrs in train data or eval data
repeated string attr_fields = 3;
// field name of item_id in train data or eval data
required string item_id_field = 4;
optional string attr_delimiter = 5 [default=":"];
optional uint32 num_eval_sample = 6 [default=0];
// only works on DataScience/Local
optional string field_delimiter = 7 [default="\001"];
}
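//
// The NegativeSamplerInMemory above takes the same fields as
// NegativeSampler; a minimal sketch (hypothetical values):
//
//   negative_sampler_in_memory {
//     input_path: "data/item_table.txt"
//     num_sample: 1024
//     attr_fields: "item_id"
//     item_id_field: "item_id"
//   }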
// Weighted random sampling of item ids that have no edge with the user
message NegativeSamplerV2 {
// user data path
// schema: userid weight
required string user_input_path = 1;
// item data path
// schema: itemid weight attrs
required string item_input_path = 2;
// positive edge path
// schema: userid itemid weight
required string pos_edge_input_path = 3;
// number of negative samples
required uint32 num_sample = 4;
// field names of attrs in train data or eval data
repeated string attr_fields = 5;
// field name of item_id in train data or eval data
required string item_id_field = 6;
// field name of user_id in train data or eval data
required string user_id_field = 7;
optional string attr_delimiter = 8 [default=":"];
optional uint32 num_eval_sample = 9 [default=0];
// only works on DataScience/Local
optional string field_delimiter = 10 [default="\001"];
}
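//
// A minimal sketch of the NegativeSamplerV2 above inside data_config
// (paths and field names are hypothetical):
//
//   negative_sampler_v2 {
//     user_input_path: "data/user_table.txt"
//     item_input_path: "data/item_table.txt"
//     pos_edge_input_path: "data/positive_edge_table.txt"
//     num_sample: 1024
//     attr_fields: "item_id"
//     item_id_field: "item_id"
//     user_id_field: "user_id"
//   }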
// Weighted random sampling of item ids not in the current batch,
// plus sampling of hard negatives along edges
message HardNegativeSampler {
// user data path
// schema: userid weight
required string user_input_path = 1;
// item data path
// schema: itemid weight attrs
required string item_input_path = 2;
// hard negative edge path
// schema: userid itemid weight
required string hard_neg_edge_input_path = 3;
// number of negative samples
required uint32 num_sample = 4;
// maximum number of hard negative samples
required uint32 num_hard_sample = 5;
// field names of attrs in train data or eval data
repeated string attr_fields = 6;
// field name of item_id in train data or eval data
required string item_id_field = 7;
// field name of user_id in train data or eval data
required string user_id_field = 8;
optional string attr_delimiter = 9 [default=":"];
optional uint32 num_eval_sample = 10 [default=0];
// only works on DataScience/Local
optional string field_delimiter = 11 [default="\001"];
}
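//
// A minimal sketch of the HardNegativeSampler above
// (paths and field names are hypothetical):
//
//   hard_negative_sampler {
//     user_input_path: "data/user_table.txt"
//     item_input_path: "data/item_table.txt"
//     hard_neg_edge_input_path: "data/hard_neg_edge_table.txt"
//     num_sample: 1024
//     num_hard_sample: 32
//     attr_fields: "item_id"
//     item_id_field: "item_id"
//     user_id_field: "user_id"
//   }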
// Weighted random sampling of item ids that have no edge with the user,
// plus sampling of hard negatives along edges
message HardNegativeSamplerV2 {
// user data path
// schema: userid weight
required string user_input_path = 1;
// item data path
// schema: itemid weight attrs
required string item_input_path = 2;
// positive edge path
// schema: userid itemid weight
required string pos_edge_input_path = 3;
// hard negative edge path
// schema: userid itemid weight
required string hard_neg_edge_input_path = 4;
// number of negative samples
required uint32 num_sample = 5;
// maximum number of hard negative samples
required uint32 num_hard_sample = 6;
// field names of attrs in train data or eval data
repeated string attr_fields = 7;
// field name of item_id in train data or eval data
required string item_id_field = 8;
// field name of user_id in train data or eval data
required string user_id_field = 9;
optional string attr_delimiter = 10 [default=":"];
optional uint32 num_eval_sample = 11 [default=0];
// only works on DataScience/Local
optional string field_delimiter = 12 [default="\001"];
}
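//
// The HardNegativeSamplerV2 above adds pos_edge_input_path on top of
// HardNegativeSampler; a minimal sketch (hypothetical paths):
//
//   hard_negative_sampler_v2 {
//     user_input_path: "data/user_table.txt"
//     item_input_path: "data/item_table.txt"
//     pos_edge_input_path: "data/positive_edge_table.txt"
//     hard_neg_edge_input_path: "data/hard_neg_edge_table.txt"
//     num_sample: 1024
//     num_hard_sample: 32
//     attr_fields: "item_id"
//     item_id_field: "item_id"
//     user_id_field: "user_id"
//   }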
message DatasetConfig {
// mini-batch size to use for training and evaluation.
optional uint32 batch_size = 1 [default = 32];
enum FieldType {
INT32 = 0;
INT64 = 1;
STRING = 2;
FLOAT = 4;
DOUBLE = 5;
BOOL = 6;
}
message Field {
required string input_name = 1;
required FieldType input_type = 2 [default = STRING];
optional string default_val = 3;
optional uint32 input_dim = 4 [default=1];
optional uint32 input_shape = 5 [default = 1];
// user-defined function for the label, e.g. tf.math.log1p, remap_lbl
optional string user_define_fn = 6;
// user-defined function path, e.g. /samples/demo_script/process_lbl.py
optional string user_define_fn_path = 7;
// output field type of user-defined function.
optional FieldType user_define_fn_res_type = 8;
// value to be ignored in the input
optional string ignore_val = 9;
}
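//
// A minimal sketch of Field entries as they appear in input_fields
// (names and types are illustrative only):
//
//   input_fields {
//     input_name: "clk"
//     input_type: INT32
//   }
//   input_fields {
//     input_name: "title"
//     input_type: STRING
//     default_val: ""
//   }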
// set auto_expand_input_fields to true to
// auto-expand field[1-21] into field1, field2, ..., field21
optional bool auto_expand_input_fields = 3 [default = false];
// label fields, normally only one field is used;
// for multi-target models such as MMoE,
// multiple label_fields are set.
repeated string label_fields = 4;
// label separator
repeated string label_sep = 41;
// label dimensions, which need to be set when
// some labels have dimension > 1
repeated uint32 label_dim = 42;
message LabelFunction {
required string label_name = 1;
required string label_func = 2;
}
// extra transformation functions that generate new labels
repeated LabelFunction extra_label_func = 43;
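//
// A minimal sketch of extra_label_func (the function string is
// illustrative, following the user_define_fn examples above):
//
//   extra_label_func {
//     label_name: "log_play_time"
//     label_func: "tf.math.log1p"
//   }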
// whether to shuffle data
optional bool shuffle = 5 [default = true];
// shuffle buffer for better performance; even if the shuffle buffer
// is set, it is suggested to do a full data shuffle before training,
// especially when model performance is not good.
optional int32 shuffle_buffer_size = 11 [default = 32];
// The number of times a data source is read. If set to zero, the data source
// will be reused indefinitely.
optional uint32 num_epochs = 6 [default = 0];
// Number of decoded batches to prefetch.
optional uint32 prefetch_size = 7 [default = 32];
// shard the dataset to 1/num_workers in distributed mode
// @Deprecated: this parameter is not used anymore
optional bool shard = 801 [default = false];
// shard by file, not by sample, valid only for CSVInput
optional bool file_shard = 802 [default = false];
enum InputType {
// csv format input, can be used locally or on hdfs;
// supports .gz compression (but not .tar.gz files)
CSVInput = 10;
// @Deprecated
CSVInputV2 = 11;
// extended csv format, allow quote in fields
CSVInputEx = 12;
// @Deprecated, has a memory leak problem
OdpsInput = 2;
// odps input, used on pai
OdpsInputV2 = 3;
DataHubInput = 15;
OdpsInputV3 = 9;
RTPInput = 4;
RTPInputV2 = 5;
OdpsRTPInput = 601;
OdpsRTPInputV2 = 602;
TFRecordInput = 7;
BatchTFRecordInput = 14;
// for debugging performance bottlenecks of
// input pipelines
DummyInput = 8;
KafkaInput = 13;
HiveInput = 16;
HiveRTPInput = 17;
HiveParquetInput = 18;
// All features are packed into one field for fast copying to gpu,
// and there is no feature preprocessing step; it is assumed that
// features are preprocessed before training.
// Requirements: python3 and tf2.x due to the multiprocessing spawn
// and RaggedTensor apis.
ParquetInput = 19;
// Features are not packed, and are preprocessed separately.
// Requirements: python3 and tf2.x due to the multiprocessing spawn
// and RaggedTensor apis.
ParquetInputV2 = 20;
// c++ version of the parquet dataset, which is currently only
// available with DeepRec.
ParquetInputV3 = 21;
CriteoInput = 1001;
}
required InputType input_type = 10;
// separator of column features, only used for CSVInput*
// not used in OdpsInput*
// binary separators are supported:
// CTRL+A could be set as '\001'
// CTRL+B could be set as '\002'
// CTRL+C could be set as '\003'
// for RTPInput and OdpsRTPInput it is usually set
// to '\002'
optional string separator = 12 [default = ','];
// parallel preprocessing of raw data; avoid using too small
// or too large numbers (suggested to be smaller than the
// number of cores)
optional uint32 num_parallel_calls = 13 [default = 8];
// only used for OdpsInput/OdpsInputV2/OdpsRTPInput, comma separated;
// for RTPInput, selected_cols uses indices as column names,
// such as '1,2,4', where 1,2 are the label columns, 4 is
// the feature column, and columns 0,3 are not used
optional string selected_cols = 14 [default = ''];
// selected column types, only used for OdpsInput/OdpsInputV2
// to avoid errors in data type settings
optional string selected_col_types = 15 [default = ''];
// the input fields must match the number and order of the
// columns in csv files or odps tables
repeated Field input_fields = 16;
// for RTPInput only
optional string rtp_separator = 17 [default = ';'];
// ignore some data errors
// it is not suggested to set this parameter
optional bool ignore_error = 18 [default=false];
// whether to use pai global shuffle queue, only for OdpsInput,
// OdpsInputV2, OdpsRTPInputV2
optional bool pai_worker_queue = 19 [default = false];
optional int32 pai_worker_slice_num = 20 [default = 100];
// if true, one worker will duplicate the data of the chief node
// and undertake the gradient computation of the chief node
optional bool chief_redundant = 21 [default = false];
// input field for sample weight
optional string sample_weight = 22;
// the compression type of tfrecord
optional string data_compression_type = 23 [default = ''];
// number of data per feature in one tfrecord, used by BatchTFRecordInput
optional uint32 n_data_batch_tfrecord = 24;
// csv files may optionally have a header;
// in that case, input_name must match the header name,
// and the number and order of input_fields
// need not be the same as the columns in the csv files.
optional bool with_header = 25 [default = false];
repeated string feature_fields = 26;
oneof sampler {
NegativeSampler negative_sampler = 101;
NegativeSamplerV2 negative_sampler_v2 = 102;
HardNegativeSampler hard_negative_sampler = 103;
HardNegativeSamplerV2 hard_negative_sampler_v2 = 104;
NegativeSamplerInMemory negative_sampler_in_memory = 105;
}
optional uint32 eval_batch_size = 1001 [default = 4096];
optional bool drop_remainder = 1002 [default = false];
}
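//
// A minimal end-to-end sketch of a data_config block built from this
// message (values are illustrative, not recommendations):
//
//   data_config {
//     batch_size: 1024
//     label_fields: "clk"
//     num_epochs: 2
//     prefetch_size: 32
//     input_type: CSVInput
//     separator: ","
//     input_fields {
//       input_name: "clk"
//       input_type: INT32
//     }
//     input_fields {
//       input_name: "title"
//       input_type: STRING
//     }
//   }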