// ParquetFilePrinter::JSONPrint — excerpt from cpp/src/parquet/printer.cc (lines 244-406).

void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
                                   const char* filename) {
  // Serialize the Parquet file's metadata to `stream` as a JSON document:
  // file-level info, the selected schema columns, then per-row-group and
  // per-column-chunk details (stats, compression, encodings, bloom filter
  // and page-index locations).
  //
  // selected_columns: column indices to print; an empty list means "all
  // columns". Throws ParquetException on any out-of-range index.
  //
  // NOTE(review): string values (filename, created_by, column paths, encoded
  // stat values) are not JSON-escaped, so output containing '"' or '\' is
  // malformed JSON — preserved as-is to keep the established output format.
  const FileMetaData* file_metadata = fileReader->metadata().get();
  stream << "{\n";
  stream << "  \"FileName\": \"" << filename << "\",\n";
  stream << "  \"Version\": \"" << ParquetVersionToString(file_metadata->version())
         << "\",\n";
  stream << "  \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
  stream << "  \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
  stream << "  \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
  stream << "  \"NumberOfRealColumns\": \""
         << file_metadata->schema()->group_node()->field_count() << "\",\n";
  stream << "  \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";

  // Empty selection means "all columns"; otherwise validate each index.
  if (selected_columns.empty()) {
    for (int i = 0; i < file_metadata->num_columns(); i++) {
      selected_columns.push_back(i);
    }
  } else {
    for (auto i : selected_columns) {
      if (i < 0 || i >= file_metadata->num_columns()) {
        throw ParquetException("Selected column is out of range");
      }
    }
  }

  // Schema: one entry per selected column.
  stream << "  \"Columns\": [\n";
  int c = 0;
  for (auto i : selected_columns) {
    const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
    stream << "     { \"Id\": \"" << i << "\","
           << " \"Name\": \"" << descr->path()->ToDotString() << "\","
           << " \"PhysicalType\": \""
           << TypeToString(descr->physical_type(), descr->type_length()) << "\","
           << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type())
           << "\","
           << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }";
    c++;
    if (c != static_cast<int>(selected_columns.size())) {
      stream << ",\n";
    }
  }

  stream << "\n  ],\n  \"RowGroups\": [\n";
  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
    stream << "     {\n       \"Id\": \"" << r << "\", ";

    auto group_reader = fileReader->RowGroup(r);
    std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);

    stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
    stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size()
           << "\", ";
    // "SortColumns" is only emitted when the row group declares any.
    // NOTE(review): `descending`/`nulls_first` are streamed as 0/1 (valid JSON
    // numbers, not booleans) — preserved for output compatibility.
    auto row_group_sorting_columns = group_metadata->sorting_columns();
    if (!row_group_sorting_columns.empty()) {
      stream << " \"SortColumns\": [\n";
      for (size_t i = 0; i < row_group_sorting_columns.size(); i++) {
        stream << "         {\"column_idx\": " << row_group_sorting_columns[i].column_idx
               << ", \"descending\": " << row_group_sorting_columns[i].descending
               << ", \"nulls_first\": " << row_group_sorting_columns[i].nulls_first
               << "}";
        if (i + 1 != row_group_sorting_columns.size()) {
          stream << ",";
        }
        stream << '\n';
      }
      stream << "       ], ";
    }
    stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";

    // Print column metadata
    stream << "       \"ColumnChunks\": [\n";
    int c1 = 0;
    for (auto i : selected_columns) {
      auto column_chunk = group_metadata->ColumnChunk(i);
      std::shared_ptr<Statistics> stats = column_chunk->statistics();

      const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
      stream << "          {\"Id\": \"" << i << "\", \"Values\": \""
             << column_chunk->num_values() << "\", "
             << "\"StatsSet\": ";
      if (column_chunk->is_stats_set()) {
        stream << R"("True", "Stats": {)";
        // Track whether a member has already been written so each subsequent
        // member is prefixed with ", ".  (Fixes invalid JSON: previously a
        // leading comma was emitted when NumNulls was absent but
        // DistinctValues or Min/Max were present.)
        bool needs_separator = false;
        if (stats->HasNullCount()) {
          stream << R"("NumNulls": ")" << stats->null_count() << "\"";
          needs_separator = true;
        }
        if (stats->HasDistinctCount()) {
          if (needs_separator) {
            stream << ", ";
          }
          stream << R"("DistinctValues": ")" << stats->distinct_count() << "\"";
          needs_separator = true;
        }
        if (stats->HasMinMax()) {
          if (needs_separator) {
            stream << ", ";
          }
          std::string min = stats->EncodeMin(), max = stats->EncodeMax();
          stream << R"("Max": ")" << FormatStatValue(descr->physical_type(), max)
                 << "\", "
                 << R"("Min": ")" << FormatStatValue(descr->physical_type(), min) << "\"";
        }
        stream << " },";
      } else {
        stream << "\"False\",";
      }
      stream << "\n           \"Compression\": \""
             << ::arrow::internal::AsciiToUpper(
                    Codec::GetCodecAsString(column_chunk->compression()))
             << R"(", "Encodings": )";
      stream << "\"";
      // Prefer per-page encoding stats when available; otherwise list the
      // chunk's declared encodings (space-separated, with a trailing space —
      // preserved for output compatibility).
      if (column_chunk->encoding_stats().empty()) {
        for (auto encoding : column_chunk->encodings()) {
          stream << EncodingToString(encoding) << " ";
        }
      } else {
        PrintPageEncodingStats(stream, column_chunk->encoding_stats());
      }
      stream << "\"";
      stream << ", "
             << R"("UncompressedSize": ")" << column_chunk->total_uncompressed_size()
             << R"(", "CompressedSize": ")" << column_chunk->total_compressed_size()
             << "\"";

      if (column_chunk->bloom_filter_offset()) {
        // Output BloomFilter {offset, length}; length is optional metadata.
        stream << ", \"BloomFilter\": {"
               << R"("offset": ")" << column_chunk->bloom_filter_offset().value() << "\"";
        if (column_chunk->bloom_filter_length()) {
          stream << R"(, "length": ")" << column_chunk->bloom_filter_length().value()
                 << "\"";
        }
        stream << "}";
      }

      if (column_chunk->GetColumnIndexLocation()) {
        auto location = column_chunk->GetColumnIndexLocation().value();
        // Output ColumnIndex {offset, length}
        stream << ", \"ColumnIndex\": {"
               << R"("offset": ")" << location.offset;
        stream << R"(", "length": ")" << location.length;
        stream << "\"}";
      }

      if (column_chunk->GetOffsetIndexLocation()) {
        auto location = column_chunk->GetOffsetIndexLocation().value();
        // Output OffsetIndex {offset, length}
        stream << ", \"OffsetIndex\": {"
               << R"("offset": ")" << location.offset << "\"";
        stream << R"(, "length": ")" << location.length << "\"";
        stream << "}";
      }

      // end of a ColumnChunk
      stream << " }";
      c1++;
      if (c1 != static_cast<int>(selected_columns.size())) {
        stream << ",\n";
      }
    }

    stream << "\n        ]\n     }";
    if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
      stream << ",\n";
    }
  }
  stream << "\n  ]\n}\n";
}