private static Schema inferSchemaInternal()

in parquet-cli/src/main/java/org/apache/parquet/cli/csv/AvroCSV.java [89:195]


  /**
   * Builds an Avro record schema by sampling rows from CSV input.
   *
   * <p>Column names come from the first CSV row when {@code props.useHeader} is set, from
   * {@code props.header} when that is non-null, and are otherwise generated as
   * {@code field_0}, {@code field_1}, ... based on the width of the first row. Up to
   * {@code DEFAULT_INFER_LINES} rows are then examined. For each column, the type is taken
   * from the first sampled value for which {@code inferFieldType} returns a non-null type;
   * later values do not revise it.
   *
   * <p>The reader created over {@code incoming} is not closed by this method.
   *
   * @param name name of the record schema to build
   * @param incoming stream of CSV data to sample
   * @param props CSV settings; {@code useHeader} and {@code header} select the header source
   * @param requiredFields names of fields that must not be nullable
   * @param makeNullable if true, every non-required field is built as nullable with a
   *     {@code null} default
   * @return the assembled record schema
   * @throws IOException if reading or parsing the CSV data fails
   * @throws NullPointerException if there is no data row to infer from
   * @throws RuntimeException if a column name is null, blank, or rejected by
   *     {@code isAvroCompatibleName}, or if a required field had a null (or, for
   *     non-string types, empty) sampled value
   */
  private static Schema inferSchemaInternal(
      String name, InputStream incoming, CSVProperties props, Set<String> requiredFields, boolean makeNullable)
      throws IOException {
    final CSVReader csv = newReader(incoming, props);

    final String[] columns;
    String[] row;
    if (props.useHeader) {
      // the header is the first row of the data; the first data row follows it
      columns = csv.readNext();
      row = Objects.requireNonNull(csv.readNext(), "No content to infer schema");
    } else if (props.header != null) {
      // the header was supplied out of band, so every row in the stream is data
      columns = newParser(props).parseLine(props.header);
      row = Objects.requireNonNull(csv.readNext(), "No content to infer schema");
    } else {
      // no header anywhere: synthesize names from the width of the first data row
      row = Objects.requireNonNull(csv.readNext(), "No content to infer schema");
      columns = new String[row.length];
      for (int col = 0; col < columns.length; col++) {
        columns[col] = "field_" + col;
      }
    }

    final int width = columns.length;
    final Schema.Type[] inferredTypes = new Schema.Type[width];
    final String[] samples = new String[width];
    final boolean[] sawNull = new boolean[width];
    final boolean[] sawEmpty = new boolean[width];

    // sample rows until the limit is reached or the input runs out
    for (int sampled = 0; sampled < DEFAULT_INFER_LINES && row != null; sampled++) {
      for (int col = 0; col < width; col++) {
        if (col >= row.length) {
          // a short row has no value for this column, which counts as null
          sawNull[col] = true;
          continue;
        }

        final String cell = row[col];
        if (inferredTypes[col] == null) {
          final Schema.Type guess = inferFieldType(cell);
          inferredTypes[col] = guess;
          if (guess != null) {
            // remember the value that decided the type; it is quoted in the field doc
            samples[col] = cell;
          }
        }

        if (cell == null) {
          sawNull[col] = true;
        } else if (cell.isEmpty()) {
          sawEmpty[col] = true;
        }
      }

      row = csv.readNext();
    }

    SchemaBuilder.FieldAssembler<Schema> fields = SchemaBuilder.record(name).fields();

    // A column may still have no inferred type here; schema(...) is handed the null type
    // as-is (per the original note, a nullable string is expected in that case).
    for (int col = 0; col < width; col++) {
      final String rawName = columns[col];
      if (rawName == null) {
        throw new RuntimeException("Bad header for field " + col + ": null");
      }

      final String fieldName = rawName.trim();
      if (fieldName.isEmpty()) {
        throw new RuntimeException("Bad header for field " + col + ": \"" + fieldName + "\"");
      }
      if (!isAvroCompatibleName(fieldName)) {
        throw new RuntimeException("Bad header for field, should start with a character "
            + "or _ and can contain only alphanumerics and _ "
            + col
            + ": \"" + fieldName + "\"");
      }

      final Schema.Type type = inferredTypes[col];

      // an empty string is a legitimate value for string fields, but means null otherwise
      final boolean foundNull = sawNull[col] || (sawEmpty[col] && type != Schema.Type.STRING);
      final boolean required = requiredFields.contains(fieldName);

      if (required && foundNull) {
        throw new RuntimeException(
            "Found null value for required field: " + fieldName + " (" + type + ")");
      }

      // required fields are never nullable; others are when requested or when a null was seen
      final boolean optional = !required && (makeNullable || foundNull);

      final SchemaBuilder.GenericDefault<Schema> typedField = fields
          .name(fieldName)
          .doc("Type inferred from '" + sample(samples[col]) + "'")
          .type(schema(type, optional));

      fields = optional ? typedField.withDefault(null) : typedField.noDefault();
    }

    return fields.endRecord();
  }