in tensorflow_examples/lite/model_maker/core/task/model_spec/text_spec.py [0:0]
def evaluate(self, model, tflite_filepath, dataset, num_steps, eval_examples,
             eval_features, predict_file, version_2_with_negative,
             max_answer_length, null_score_diff_threshold, verbose_logging,
             output_dir):
"""Evaluate QA model.
Args:
model: The keras model to be evaluated.
tflite_filepath: File path to the TFLite model.
dataset: tf.data.Dataset used for evaluation.
num_steps: Number of steps to evaluate the model.
eval_examples: List of `squad_lib.SquadExample` for evaluation data.
eval_features: List of `squad_lib.InputFeatures` for evaluation data.
predict_file: The input predict file.
version_2_with_negative: Whether the input predict file is SQuAD 2.0
format.
max_answer_length: The maximum length of an answer that can be generated.
This is needed because the start and end predictions are not conditioned
on one another.
null_score_diff_threshold: If null_score - best_non_null is greater than
the threshold, predict null. This is only used for SQuAD v2.
verbose_logging: If true, all of the warnings related to data processing
will be printed. A number of warnings are expected for a normal SQuAD
evaluation.
output_dir: The output directory to save output to json files:
predictions.json, nbest_predictions.json, null_odds.json. If None, skip
saving to json files.
Returns:
A dict contains two metrics: Exact match rate and F1 score.
"""
  if model is not None and tflite_filepath is not None:
    raise ValueError('Exactly one of the parameters `model` and '
                     '`tflite_filepath` should be set.')
  elif model is None and tflite_filepath is None:
    raise ValueError('At least one of the parameters `model` and '
                     '`tflite_filepath` should be set.')
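  # Run inference with the TFLite model when a file path is given, otherwise
  # with the Keras model.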
  if tflite_filepath is not None:
    all_results = self.predict_tflite(tflite_filepath, dataset)
  else:
    all_results = self.predict(model, dataset, num_steps)
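  # Post-process the raw predictions into text answers, n-best lists and
  # null-odds scores using the SQuAD post-processing utilities.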
  all_predictions, all_nbest_json, scores_diff_json = (
      squad_lib.postprocess_output(
          eval_examples,
          eval_features,
          all_results,
          n_best_size=20,
          max_answer_length=max_answer_length,
          do_lower_case=self.do_lower_case,
          version_2_with_negative=version_2_with_negative,
          null_score_diff_threshold=null_score_diff_threshold,
          verbose=verbose_logging))
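  # Optionally dump predictions.json, nbest_predictions.json and
  # null_odds.json to `output_dir`.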
  if output_dir is not None:
    dump_to_files(all_predictions, all_nbest_json, scores_diff_json,
                  version_2_with_negative, output_dir)
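  # Score the predictions against the ground truth in `predict_file`, using
  # the SQuAD v1.1 or v2.0 evaluation depending on the dataset version.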
  dataset_json = file_util.load_json_file(predict_file)
  pred_dataset = dataset_json['data']
  if version_2_with_negative:
    eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset, all_predictions,
                                                scores_diff_json)
  else:
    eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
  return eval_metrics
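
# Illustrative usage sketch, assuming a model spec instance `spec` and
# evaluation inputs already prepared by the question-answering pipeline
# (`trained_keras_model`, `eval_dataset`, `eval_steps`, `examples` and
# `features` are hypothetical names):
#
#   metrics = spec.evaluate(
#       model=trained_keras_model,
#       tflite_filepath=None,
#       dataset=eval_dataset,
#       num_steps=eval_steps,
#       eval_examples=examples,
#       eval_features=features,
#       predict_file='dev-v1.1.json',
#       version_2_with_negative=False,
#       max_answer_length=30,
#       null_score_diff_threshold=0.0,
#       verbose_logging=False,
#       output_dir=None)
#   # `metrics` maps metric names to the exact match rate and F1 score.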