private String doScan()

in plugins/transforms/textfile/src/main/java/org/apache/hop/pipeline/transforms/fileinput/TextFileCSVImportProgressDialog.java [167:615]


  /**
   * Scans sample lines of the text file and derives field metadata from them.
   *
   * <p>The method reads lines through {@code TextFileInput.getLine}, skips the configured number
   * of header lines when {@code meta.hasHeader()} is set, converts every remaining line to a row
   * of strings with {@code TextFileInput.convertLineToRow} and feeds each field value to a
   * per-field {@link StringEvaluator}. After the scan the advised evaluation result of each
   * evaluator is written back to the corresponding {@link TextFileInputField} of {@code meta}
   * (type, trim type, format, decimal/grouping symbol, length, precision) and a human readable
   * report is assembled.
   *
   * <p>Scanning stops when the reader is exhausted, when {@code samples} lines were processed
   * ({@code samples == 0} means no limit), when the monitor is cancelled or when a line could not
   * be converted to a row.
   *
   * <p>Fields for which no value was seen at all (empty file, or rows shorter than the configured
   * field list) have no evaluator; their metadata is left as it is and they are still listed in
   * the report.
   *
   * @param monitor progress monitor that receives task and sub-task updates and is polled for
   *     cancellation
   * @param failOnParseError passed on to {@code TextFileInput.convertLineToRow} and {@code
   *     getStringFromRow}
   * @return the report text describing the scan result per field
   * @throws HopException declared by this method; it is raised by the helpers called here (the
   *     exact sources are outside this block)
   */
  private String doScan(IProgressMonitor monitor, final boolean failOnParseError)
      throws HopException {
    // Units of work: one per sampled line (only reported when a sample size is set) plus the two
    // units that are reported after the scan loop.
    if (samples > 0) {
      monitor.beginTask(
          BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"),
          samples + 2);
    } else {
      monitor.beginTask(
          BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.ScanningFile"), 2);
    }

    long fileLineNumber = 0;

    DecimalFormatSymbols dfs = new DecimalFormatSymbols();

    int nrFields = meta.getInputFields().length;

    IRowMeta outputRowMeta = new RowMeta();
    meta.getFields(outputRowMeta, null, null, null, variables, null);

    // Remove the storage meta-data (don't go for lazy conversion during scan)
    for (IValueMeta valueMeta : outputRowMeta.getValueMetaList()) {
      valueMeta.setStorageMetadata(null);
      valueMeta.setStorageType(IValueMeta.STORAGE_TYPE_NORMAL);
    }

    IRowMeta convertRowMeta = outputRowMeta.cloneToType(IValueMeta.TYPE_STRING);

    // Per field: number of null values and smallest/largest value as text, filled from the
    // advised evaluation result after the scan.
    int[] nrnull = new int[nrFields];
    String[] minstr = new String[nrFields];
    String[] maxstr = new String[nrFields];

    // Date info used by the date section of the report.
    // NOTE(review): within this method these arrays are only initialised, never refined while
    // scanning, so the date report lists every known date format with the initial min/max values.
    int[] dateFormatCount = new int[nrFields]; // How many date formats work?
    boolean[][] dateFormat =
        new boolean[nrFields][Const.getDateFormats().length]; // Which date formats work?
    Date[][] minDate = new Date[nrFields][Const.getDateFormats().length]; // min date value
    Date[][] maxDate = new Date[nrFields][Const.getDateFormats().length]; // max date value

    for (int i = 0; i < nrFields; i++) {
      TextFileInputField field = meta.getInputFields()[i];

      if (log.isDebug()) {
        debug = "init field #" + i;
      }

      if (replaceMeta) { // Clear previous info...

        // Name and type are read from the same field index and written back unchanged.
        field.setName(meta.getInputFields()[i].getName());
        field.setType(meta.getInputFields()[i].getType());
        field.setFormat("");
        field.setLength(-1);
        field.setPrecision(-1);
        field.setCurrencySymbol(dfs.getCurrencySymbol());
        field.setDecimalSymbol("" + dfs.getDecimalSeparator());
        field.setGroupSymbol("" + dfs.getGroupingSeparator());
        field.setNullString("-");
        field.setTrimType(IValueMeta.TRIM_TYPE_NONE);
      }

      nrnull[i] = 0;
      minstr[i] = "";
      maxstr[i] = "";

      // Init date guess
      for (int j = 0; j < Const.getDateFormats().length; j++) {
        dateFormat[i][j] = true;
        minDate[i][j] = Const.MAX_DATE;
        maxDate[i][j] = Const.MIN_DATE;
      }
      dateFormatCount[i] = Const.getDateFormats().length;
    }

    // Work on a copy of the metadata in which every field is a String, so that converting a line
    // to a row does not attempt any data type conversion.
    IInputFileMeta strinfo = (IInputFileMeta) meta.clone();
    for (int i = 0; i < nrFields; i++) {
      strinfo.getInputFields()[i].setType(IValueMeta.TYPE_STRING);
    }

    // Sample <samples> rows...
    debug = "get first line";

    StringBuilder lineBuffer = new StringBuilder(256);
    int fileFormatType = meta.getFileFormatTypeNr();

    // If the file has a header we overwrite the first line
    // However, if it doesn't have a header, take a new line
    //
    String line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
    fileLineNumber++;

    if (meta.hasHeader()) {
      int skipped = 0;
      while (line != null && skipped < meta.getNrHeaderLines()) {
        line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
        skipped++;
        fileLineNumber++;
      }
    }
    int linenr = 1;

    // One evaluator per field, created lazily when the first value of that field is seen.
    List<StringEvaluator> evaluators = new ArrayList<>();

    // Allocate number and date parsers
    DecimalFormat df2 = (DecimalFormat) NumberFormat.getInstance();
    DecimalFormatSymbols dfs2 = new DecimalFormatSymbols();
    SimpleDateFormat daf2 = new SimpleDateFormat();

    boolean errorFound = false;
    while (!errorFound
        && line != null
        && (linenr <= samples || samples == 0)
        && !monitor.isCanceled()) {
      monitor.subTask(
          BaseMessages.getString(
              PKG, "TextFileCSVImportProgressDialog.Task.ScanningLine", "" + linenr));
      if (samples > 0) {
        monitor.worked(1);
      }

      if (log.isDebug()) {
        debug = "convert line #" + linenr + " to row";
      }
      IRowMeta rowMeta = new RowMeta();
      meta.getFields(rowMeta, "transformName", null, null, variables, null);
      // Remove the storage meta-data (don't go for lazy conversion during scan)
      for (IValueMeta valueMeta : rowMeta.getValueMetaList()) {
        valueMeta.setStorageMetadata(null);
        valueMeta.setStorageType(IValueMeta.STORAGE_TYPE_NORMAL);
      }

      String delimiter = variables.resolve(meta.getSeparator());
      String enclosure = variables.resolve(meta.getEnclosure());
      String escapeCharacter = variables.resolve(meta.getEscapeCharacter());
      Object[] r =
          TextFileInput.convertLineToRow(
              log,
              new TextFileLine(line, fileLineNumber, null),
              strinfo,
              null,
              0,
              outputRowMeta,
              convertRowMeta,
              meta.getFilePaths(variables)[0],
              rownumber,
              delimiter,
              enclosure,
              escapeCharacter,
              null,
              false,
              false,
              false,
              false,
              false,
              false,
              false,
              false,
              null,
              null,
              false,
              null,
              null,
              null,
              null,
              0,
              failOnParseError);

      if (r == null) {
        // The line could not be converted: the loop condition ends the scan.
        errorFound = true;
        continue;
      }
      rownumber++;
      for (int i = 0; i < nrFields && i < r.length; i++) {
        StringEvaluator evaluator;
        if (i >= evaluators.size()) {
          evaluator = new StringEvaluator(true);
          evaluators.add(evaluator);
        } else {
          evaluator = evaluators.get(i);
        }

        String string = getStringFromRow(rowMeta, r, i, failOnParseError);
        evaluator.evaluateString(string);
      }

      fileLineNumber++;
      linenr++;

      // Grab another line...
      //
      line = TextFileInput.getLine(log, reader, encodingType, fileFormatType, lineBuffer);
    }

    monitor.worked(1);
    monitor.setTaskName(
        BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Task.AnalyzingResults"));

    // Show information on items using a dialog box
    //
    StringBuilder message = new StringBuilder();
    message.append(
        BaseMessages.getString(
            PKG, "TextFileCSVImportProgressDialog.Info.ResultAfterScanning", "" + (linenr - 1)));
    message.append(
        BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.HorizontalLine"));

    for (int i = 0; i < nrFields; i++) {
      TextFileInputField field = meta.getInputFields()[i];

      // There is no evaluator for a field that never received a value (no data lines, or all
      // rows shorter than the field list). In that case the field metadata is left untouched.
      StringEvaluator evaluator = i < evaluators.size() ? evaluators.get(i) : null;
      List<StringEvaluationResult> evaluationResults = new ArrayList<>();

      if (evaluator != null) {
        evaluationResults = evaluator.getStringEvaluationResults();

        // If we didn't find any matching result, it's a String...
        //
        StringEvaluationResult result = evaluator.getAdvicedResult();
        if (evaluationResults.isEmpty()) {
          field.setType(IValueMeta.TYPE_STRING);
          field.setLength(evaluator.getMaxLength());
        }
        if (result != null) {
          // Take the first option we find, list the others below...
          //
          IValueMeta conversionMeta = result.getConversionMeta();
          field.setType(conversionMeta.getType());
          field.setTrimType(conversionMeta.getTrimType());
          field.setFormat(conversionMeta.getConversionMask());
          field.setDecimalSymbol(conversionMeta.getDecimalSymbol());
          field.setGroupSymbol(conversionMeta.getGroupingSymbol());
          field.setLength(conversionMeta.getLength());
          field.setPrecision(conversionMeta.getPrecision());

          nrnull[i] = result.getNrNull();
          minstr[i] = result.getMin() == null ? "" : result.getMin().toString();
          maxstr[i] = result.getMax() == null ? "" : result.getMax().toString();
        }
      }

      message.append(
          BaseMessages.getString(
              PKG, "TextFileCSVImportProgressDialog.Info.FieldNumber", "" + (i + 1)));

      message.append(
          BaseMessages.getString(
              PKG, "TextFileCSVImportProgressDialog.Info.FieldName", field.getName()));
      message.append(
          BaseMessages.getString(
              PKG, "TextFileCSVImportProgressDialog.Info.FieldType", field.getTypeDesc()));

      switch (field.getType()) {
        case IValueMeta.TYPE_NUMBER:
          message.append(
              BaseMessages.getString(
                  PKG,
                  "TextFileCSVImportProgressDialog.Info.EstimatedLength",
                  (field.getLength() < 0 ? "-" : "" + field.getLength())));
          message.append(
              BaseMessages.getString(
                  PKG,
                  "TextFileCSVImportProgressDialog.Info.EstimatedPrecision",
                  field.getPrecision() < 0 ? "-" : "" + field.getPrecision()));
          message.append(
              BaseMessages.getString(
                  PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat", field.getFormat()));

          if (!evaluationResults.isEmpty()) {
            if (evaluationResults.size() > 1) {
              message.append(
                  BaseMessages.getString(
                      PKG, "TextFileCSVImportProgressDialog.Info.WarnNumberFormat"));
            }

            for (StringEvaluationResult seResult : evaluationResults) {
              String mask = seResult.getConversionMeta().getConversionMask();

              message.append(
                  BaseMessages.getString(
                      PKG, "TextFileCSVImportProgressDialog.Info.NumberFormat2", mask));
              message.append(
                  BaseMessages.getString(
                      PKG,
                      "TextFileCSVImportProgressDialog.Info.TrimType",
                      seResult.getConversionMeta().getTrimType()));
              message.append(
                  BaseMessages.getString(
                      PKG,
                      "TextFileCSVImportProgressDialog.Info.NumberMinValue",
                      seResult.getMin()));
              message.append(
                  BaseMessages.getString(
                      PKG,
                      "TextFileCSVImportProgressDialog.Info.NumberMaxValue",
                      seResult.getMax()));

              // Best effort: add an example of the minimum value parsed with this mask. Any
              // failure (including a missing minimum) only skips the example.
              try {
                df2.applyPattern(mask);
                df2.setDecimalFormatSymbols(dfs2);
                double mn = df2.parse(seResult.getMin().toString()).doubleValue();
                message.append(
                    BaseMessages.getString(
                        PKG,
                        "TextFileCSVImportProgressDialog.Info.NumberExample",
                        mask,
                        seResult.getMin(),
                        Double.toString(mn)));
              } catch (Exception e) {
                if (log.isDetailed()) {
                  log.logDetailed(
                      "This is unexpected: parsing ["
                          + seResult.getMin()
                          + "] with format ["
                          + mask
                          + "] did not work.");
                }
              }
            }
          }
          message.append(
              BaseMessages.getString(
                  PKG, "TextFileCSVImportProgressDialog.Info.NumberNrNullValues", "" + nrnull[i]));
          break;
        case IValueMeta.TYPE_STRING:
          message.append(
              BaseMessages.getString(
                  PKG,
                  "TextFileCSVImportProgressDialog.Info.StringMaxLength",
                  "" + field.getLength()));
          message.append(
              BaseMessages.getString(
                  PKG, "TextFileCSVImportProgressDialog.Info.StringMinValue", minstr[i]));
          message.append(
              BaseMessages.getString(
                  PKG, "TextFileCSVImportProgressDialog.Info.StringMaxValue", maxstr[i]));
          message.append(
              BaseMessages.getString(
                  PKG, "TextFileCSVImportProgressDialog.Info.StringNrNullValues", "" + nrnull[i]));
          break;
        case IValueMeta.TYPE_DATE:
          message.append(
              BaseMessages.getString(
                  PKG,
                  "TextFileCSVImportProgressDialog.Info.DateMaxLength",
                  field.getLength() < 0 ? "-" : "" + field.getLength()));
          message.append(
              BaseMessages.getString(
                  PKG, "TextFileCSVImportProgressDialog.Info.DateFormat", field.getFormat()));
          if (dateFormatCount[i] > 1) {
            message.append(
                BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.WarnDateFormat"));
          }
          if (!Utils.isEmpty(minstr[i])) {
            for (int x = 0; x < Const.getDateFormats().length; x++) {
              if (dateFormat[i][x]) {
                message.append(
                    BaseMessages.getString(
                        PKG,
                        "TextFileCSVImportProgressDialog.Info.DateFormat2",
                        Const.getDateFormats()[x]));
                Date mindate = minDate[i][x];
                Date maxdate = maxDate[i][x];
                message.append(
                    BaseMessages.getString(
                        PKG,
                        "TextFileCSVImportProgressDialog.Info.DateMinValue",
                        mindate.toString()));
                message.append(
                    BaseMessages.getString(
                        PKG,
                        "TextFileCSVImportProgressDialog.Info.DateMaxValue",
                        maxdate.toString()));

                // Best effort: add an example of the minimum value parsed with this format.
                daf2.applyPattern(Const.getDateFormats()[x]);
                try {
                  Date md = daf2.parse(minstr[i]);
                  message.append(
                      BaseMessages.getString(
                          PKG,
                          "TextFileCSVImportProgressDialog.Info.DateExample",
                          Const.getDateFormats()[x],
                          minstr[i],
                          md.toString()));
                } catch (Exception e) {
                  if (log.isDetailed()) {
                    log.logDetailed(
                        "This is unexpected: parsing ["
                            + minstr[i]
                            + "] with format ["
                            + Const.getDateFormats()[x]
                            + "] did not work.");
                  }
                }
              }
            }
          }
          message.append(
              BaseMessages.getString(
                  PKG, "TextFileCSVImportProgressDialog.Info.DateNrNullValues", "" + nrnull[i]));
          break;
        default:
          break;
      }
      // linenr - 1 is the number of lines that were scanned.
      if (nrnull[i] == linenr - 1) {
        message.append(
            BaseMessages.getString(PKG, "TextFileCSVImportProgressDialog.Info.AllNullValues"));
      }
      message.append(Const.CR);
    }

    monitor.worked(1);
    monitor.done();

    return message.toString();
  }