def _deduplicate(data)

in tensorflow_datasets/text/reddit_disentanglement.py


import collections
import itertools


def _deduplicate(data):
  """Remove duplicated records."""
  # Count how many times each id occurs; ids seen more than once need to be
  # resolved below.
  cnt = collections.Counter(row["id"] for row in data)
  nonuniq_ids = {id_ for id_, count in cnt.items() if count > 1}
  nonuniq_data = [row for row in data if row["id"] in nonuniq_ids]

  unique_data = [row for row in data if row["id"] not in nonuniq_ids]
  # Sort so that records with the same id are adjacent: itertools.groupby
  # only groups consecutive elements.
  nonuniq_data = sorted(nonuniq_data, key=lambda row: row["id"])
  for _, same_id_data in itertools.groupby(nonuniq_data, lambda row: row["id"]):
    same_id_data = list(same_id_data)
    if all(same_id_data[0] == x for x in same_id_data):
      # All copies are identical dicts: keep a single one.
      unique_data.append(same_id_data[0])
    else:
      # Conflicting copies: keep the single one whose author is not deleted.
      non_deleted_same_id_data = [
          row for row in same_id_data if row["author"] != "[deleted]"
      ]
      if len(non_deleted_same_id_data) != 1:
        raise ValueError("Found several messages with id {} in the original"
                         " data".format(same_id_data[0]["id"]))
      unique_data.append(non_deleted_same_id_data[0])

  # Order the final result by thread (link_id), then by creation time.
  return sorted(
      unique_data, key=lambda row: (row["link_id"], row["created_utc"]))
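
For illustration, a minimal usage sketch (not part of the original module). It assumes each record is a dict carrying the "id", "author", "link_id", and "created_utc" fields the function reads; the sample rows and values are invented, with created_utc shown as an int for brevity.

# Hypothetical sample: two exact duplicates of "a1", a conflict on "a2"
# resolved in favor of the non-deleted author, and a unique "a3".
rows = [
    {"id": "a1", "author": "u1", "link_id": "t1", "created_utc": 100},
    {"id": "a1", "author": "u1", "link_id": "t1", "created_utc": 100},
    {"id": "a2", "author": "[deleted]", "link_id": "t1", "created_utc": 90},
    {"id": "a2", "author": "u2", "link_id": "t1", "created_utc": 90},
    {"id": "a3", "author": "u3", "link_id": "t0", "created_utc": 50},
]

deduped = _deduplicate(rows)
# Three rows survive, sorted by (link_id, created_utc):
# a3 (t0, 50), then a2 (t1, 90), then a1 (t1, 100).
assert [row["id"] for row in deduped] == ["a3", "a2", "a1"]
assert deduped[1]["author"] == "u2"  # the non-deleted copy of "a2" wins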