def make_concepts_targets_and_randoms()

in tcav/tcav_examples/discrete/make_kdd99_concepts.py [0:0]


def make_concepts_targets_and_randoms(source_dir):
  """Writes KDD99 concept, random and target folders under `source_dir`.

  Fetches the KDD Cup 99 dataset via `fetch_kddcup99`, loads it into a
  dataframe and writes the following, each as `<source_dir>/<name>/<name>.csv`:
    * two concept folders splitting the rows on `dst_host_same_src_port_rate`
      (values < 1 and values >= 1),
    * `random_partitions` folders named `random500_<i>`, each holding
      `random_size` randomly sampled rows,
    * one folder per unique value of the `labels` column.
  Finally writes `<source_dir>/labels.txt` with one label name per line and
  prints a short summary of what is available.

  Args:
    source_dir: Directory under which all folders and labels.txt are created.
  """
  # Feature names, in dataset column order. We need this, since sklearn does
  # not provide us with column names or types. The label column is added
  # separately below under the name "labels".
  # categorical_variables = ["protocol_type", "service","flag","labels"]
  columns = [
      "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
      "land", "wrong_fragment", "urgent", "hot", "m_failed_logins", "logged_in",
      "num_compromised", "root_shell", "su_attempted", "num_root",
      "num_file_creations", "num_shells", "num_access_files",
      "num_outbound_cmds", "is_host_login", "is_guest_login", "count",
      "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate",
      "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
      "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
      "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
      "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
      "dst_host_srv_serror_rate", "dst_host_rerror_rate",
      "dst_host_srv_rerror_rate"
  ]

  data, labels = fetch_kddcup99(return_X_y=True)

  # Create dataframe from the KDD dataset: one named column per feature
  # (data.T iterates the feature columns), plus the labels.
  dataset_columns = dict(zip(columns, data.T))
  dataset_columns["labels"] = labels
  dataframe = pd.DataFrame(dataset_columns)

  def make_concept_folder(concept_df, concept):
    # Create the folder <source_dir>/<concept> if needed and save the
    # dataframe there as <concept>.csv.
    path = os.path.join(source_dir, concept)
    if not gfile.exists(path):
      gfile.makedirs(path)

    concept_file_name = os.path.join(path, concept + ".csv")
    concept_df.to_csv(concept_file_name, index=False)

  # Concepts: split the rows on the dst_host_same_src_port_rate value.
  concept_column = "dst_host_same_src_port_rate"
  concept_less_name = "dst_host_same_src_port_rate_less_than_one"
  concept_more_name = "dst_host_same_src_port_rate_more_than_one"

  make_concept_folder(dataframe[dataframe[concept_column] < 1],
                      concept_less_name)
  make_concept_folder(dataframe[dataframe[concept_column] >= 1],
                      concept_more_name)

  # Making random_examples
  random_size = 10
  random_partitions = 11

  for i in range(random_partitions):
    random_partition_name = "random500_" + str(i)
    randoms = dataframe.sample(random_size)
    make_concept_folder(randoms, random_partition_name)

  # Make target folder
  targets = dataframe.labels.unique()
  # Decode each label once so the folder names and labels.txt always agree.
  # Labels are expected to be bytes; values that are not bytes are used as-is.
  target_names = [
      target.decode("utf-8") if isinstance(target, bytes) else target
      for target in targets
  ]

  print("Available concepts for KDD99 dataset are: ")
  print([concept_less_name, concept_more_name])
  print("\n")
  print("Available targets for KDD99 dataset are: ")
  print(targets)
  print("\n")
  # Report the number of random folders actually written above.
  print("Created {} random folders \n".format(random_partitions))
  for target, target_name in zip(targets, target_names):
    target_df = dataframe[dataframe["labels"] == target]
    make_concept_folder(target_df, target_name)

  # make labels
  with open(os.path.join(source_dir, "labels.txt"), "w") as the_file:
    for target_name in target_names:
      the_file.write(target_name + "\n")