in parquet-cli/src/main/java/org/apache/parquet/cli/csv/AvroCSV.java [89:195]
/**
 * Infers an Avro record schema from the leading rows of a CSV stream.
 *
 * <p>Column names are taken from the first row when {@code props.useHeader} is set, otherwise
 * from {@code props.header} when it is non-null, otherwise they are generated as
 * {@code field_0}, {@code field_1}, ... to match the width of the first row. At most
 * {@code DEFAULT_INFER_LINES} rows are then sampled. For each column, the first sampled value
 * for which {@code inferFieldType} returns a non-null type decides the column type, and that
 * value is recorded in the field's doc string.
 *
 * <p>A column is considered to contain nulls when a sampled row has a null value for it, when a
 * sampled row is shorter than the header, or when a sampled value is empty and the column type
 * is not {@code STRING}.
 *
 * <p>The reader created over {@code incoming} is not closed by this method.
 *
 * @param name name of the resulting record schema
 * @param incoming CSV input to sample
 * @param props CSV settings; {@code useHeader} and {@code header} select the column names
 * @param requiredFields names of fields that must be non-null; these get no default value
 * @param makeNullable when true, every non-required field is nullable with a null default
 * @return the inferred record schema
 * @throws IOException declared for failures while reading the CSV input
 * @throws NullPointerException if there is no data row to infer from
 * @throws RuntimeException if a header is null, blank, or not an Avro-compatible name, or if a
 *     required field has a null value in the sampled rows
 */
private static Schema inferSchemaInternal(
    String name, InputStream incoming, CSVProperties props, Set<String> requiredFields, boolean makeNullable)
    throws IOException {
  CSVReader reader = newReader(incoming, props);

  // Without a header row or an explicit header string, names are generated from the first row.
  final boolean generateNames = !props.useHeader && props.header == null;

  String[] header = null;
  if (props.useHeader) {
    // the first row of the input is the header
    header = reader.readNext();
  } else if (props.header != null) {
    header = newParser(props).parseLine(props.header);
  }

  // Every mode needs at least one data row to infer from.
  String[] line = reader.readNext();
  Objects.requireNonNull(line, "No content to infer schema");

  if (generateNames) {
    header = new String[line.length];
    for (int i = 0; i < line.length; i += 1) {
      header[i] = "field_" + i;
    }
  }

  Schema.Type[] types = new Schema.Type[header.length];
  String[] values = new String[header.length];
  boolean[] nullable = new boolean[header.length];
  boolean[] empty = new boolean[header.length];

  for (int processed = 0; line != null && processed < DEFAULT_INFER_LINES; processed += 1) {
    for (int i = 0; i < header.length; i += 1) {
      if (i >= line.length) {
        // no value results in null
        nullable[i] = true;
        continue;
      }

      String value = line[i];
      if (types[i] == null) {
        // only the first value that yields a type is used; later rows do not change it
        types[i] = inferFieldType(value);
        if (types[i] != null) {
          // keep track of the value used
          values[i] = value;
        }
      }

      if (value == null) {
        nullable[i] = true;
      } else if (value.isEmpty()) {
        empty[i] = true;
      }
    }
    line = reader.readNext();
  }

  SchemaBuilder.FieldAssembler<Schema> fieldAssembler =
      SchemaBuilder.record(name).fields();

  // types[i] may still be null here; per the original note, the schema helper is expected to
  // fall back to a nullable string in that case (helper not visible here -- confirm).
  for (int i = 0; i < header.length; i += 1) {
    if (header[i] == null) {
      throw new RuntimeException("Bad header for field " + i + ": null");
    }

    String fieldName = header[i].trim();
    if (fieldName.isEmpty()) {
      throw new RuntimeException("Bad header for field " + i + ": \"" + fieldName + "\"");
    } else if (!isAvroCompatibleName(fieldName)) {
      // index and name first, then the rule, matching the other header errors
      throw new RuntimeException("Bad header for field " + i + ": \"" + fieldName
          + "\", should start with a character or _ and can contain only alphanumerics and _");
    }

    // the empty string is not considered null for string fields
    boolean foundNull = (nullable[i] || (empty[i] && types[i] != Schema.Type.STRING));
    boolean required = requiredFields.contains(fieldName);
    if (required && foundNull) {
      throw new RuntimeException(
          "Found null value for required field: " + fieldName + " (" + types[i] + ")");
    }

    // Required fields are never nullable; others are nullable on request or when a null was seen.
    boolean nullableField = !required && (makeNullable || foundNull);
    SchemaBuilder.GenericDefault<Schema> defaultBuilder = fieldAssembler
        .name(fieldName)
        .doc("Type inferred from '" + sample(values[i]) + "'")
        .type(schema(types[i], nullableField));

    // nullable fields default to null; everything else has no default
    fieldAssembler = nullableField ? defaultBuilder.withDefault(null) : defaultBuilder.noDefault();
  }

  return fieldAssembler.endRecord();
}