int DiB_trainFromFiles()

in programs/dibio.c [313:441]


/*! DiB_trainFromFiles() :
 *  Loads training samples from a list of files, trains a dictionary of at
 *  most `maxDictSize` bytes from them, and saves it into `dictFileName`.
 *
 *  The training algorithm is selected by the first non-NULL pointer among
 *  `params` (legacy), `coverParams` (cover) and `fastCoverParams` (fastCover),
 *  in that priority order. The same pointer also provides the notification
 *  level used for all messages emitted here.
 *
 * @param dictFileName    destination file for the trained dictionary
 * @param maxDictSize     capacity, in bytes, of the dictionary buffer
 * @param fileNamesTable  sample file names; handed to DiB_shuffle() before use
 * @param nbFiles         number of entries in `fileNamesTable`
 * @param chunkSize       forwarded to DiB_fileStats() and DiB_loadFiles()
 *                        (presumably the block size used to split files
 *                        into samples, see `-B#` -- confirm against helpers)
 * @param params          legacy trainer parameters, or NULL
 * @param coverParams     cover trainer parameters, or NULL
 * @param fastCoverParams fastCover trainer parameters, or NULL
 * @param optimize        when non-zero, cover / fastCover use their
 *                        parameter-optimizing variant and the selected
 *                        parameters are displayed; ignored by the legacy trainer
 * @param memLimit        when non-zero, upper bound (in bytes) applied to the
 *                        amount of sample data loaded
 * @return 0 on success, 1 when dictionary training reports an error.
 *         Allocation failure and a too-small sample count are reported
 *         through EXM_THROW (codes 12 and 14 respectively).
 */
int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
{
    fileStats fs;
    size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
    int nbSamplesLoaded; /* nb of samples effectively loaded in srcBuffer */
    size_t loadedSize; /* total data loaded in srcBuffer for all samples */
    void* srcBuffer /* contiguous buffer with training data/samples */;
    void* const dictBuffer = malloc(maxDictSize);
    int result = 0;

    /* Notification level comes from whichever parameter set is in use.
     * Note : the name `displayLevel` must be kept, it is presumably the
     * variable consulted by the DISPLAYLEVEL() macro used below. */
    int const displayLevel = params ? params->zParams.notificationLevel :
        coverParams ? coverParams->zParams.notificationLevel :
        fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;

    /* Shuffle input files before we start assessing how much sample data to load.
       The purpose of the shuffle is to pick random samples when the sample
       set is larger than what we can load in memory. */
    DISPLAYLEVEL(3, "Shuffling input files\n");
    DiB_shuffle(fileNamesTable, nbFiles);

    /* Figure out how much sample data to load with how many samples */
    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);

    {
        /* memory multiplier associated with the selected trainer */
        int const memMult = params ? MEMMULT :
                            coverParams ? COVER_MEMMULT:
                            FASTCOVER_MEMMULT;
        size_t const maxMem =  DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
        /* Limit the size of the training data to the free memory (maxMem),
         * and to MAX_SAMPLES_SIZE (2GB, per the original note) */
        /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
        if (memLimit != 0) {
            /* a user-provided limit can only lower the amount of data loaded */
            DISPLAYLEVEL(2, "!  Warning : setting manual memory limit for dictionary training data at %u MB \n",
                (unsigned)(memLimit / (1 MB)));
            loadedSize = (size_t)MIN(loadedSize, memLimit);
        }
        /* NOISELENGTH extra bytes : room for the guard band written after the samples */
        srcBuffer = malloc(loadedSize+NOISELENGTH);
        sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
    }

    /* Checks */
    /* NOTE(review): when fs.nbSamples is 0, malloc(0) is allowed to return NULL,
     * in which case this reports an out-of-memory error instead of
     * "nb of samples too low" below. */
    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
    if (fs.oneSampleTooLarge) {
        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
    }
    if (fs.nbSamples < 5) {
        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
    }
    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
    }

    /* init */
    if ((S64)loadedSize < fs.totalSizeToLoad)
        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
            (unsigned)(fs.totalSizeToLoad / (1 MB)),
            (unsigned)(loadedSize / (1 MB)));

    /* Load input buffer */
    /* loadedSize is passed by address : DiB_loadFiles() may update it */
    nbSamplesLoaded = DiB_loadFiles(
        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
        nbFiles, chunkSize, displayLevel);

    {   /* Start from a genuine error code : the raw ZSTD_error_GENERIC enum
         * value is a small positive number (see zstd_errors.h) that
         * ZDICT_isError() does not flag. Error codes are encoded as
         * (size_t)-enumValue. This way, if no trainer runs below (all
         * parameter pointers NULL, assert() compiled out), the failure is
         * reported instead of saving a bogus dictionary. */
        size_t dictSize = (size_t)-ZSTD_error_GENERIC;
        if (params) {
            DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
                                                    *params);
        } else if (coverParams) {
            if (optimize) {
              dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
                                                             srcBuffer, sampleSizes, nbSamplesLoaded,
                                                             coverParams);
              if (!ZDICT_isError(dictSize)) {
                  /* report the parameters held in coverParams after optimization */
                  unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
                  DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
                              coverParams->steps, splitPercentage);
              }
            } else {
              dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
                                                     sampleSizes, nbSamplesLoaded, *coverParams);
            }
        } else if (fastCoverParams != NULL) {
            if (optimize) {
              dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
                                                              srcBuffer, sampleSizes, nbSamplesLoaded,
                                                              fastCoverParams);
              if (!ZDICT_isError(dictSize)) {
                /* report the parameters held in fastCoverParams after optimization */
                unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
                DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
                            fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
                            fastCoverParams->accel);
              }
            } else {
              dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
                                                        sampleSizes, nbSamplesLoaded, *fastCoverParams);
            }
        } else {
            /* No parameter set provided. With assert() disabled, dictSize keeps
             * its initial error code and the failure path below is taken. */
            assert(0 /* Impossible */);
        }
        if (ZDICT_isError(dictSize)) {
            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
            result = 1;
            goto _cleanup;
        }
        /* save dict */
        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
        DiB_saveDict(dictFileName, dictBuffer, dictSize);
    }

    /* clean up */
_cleanup:
    free(srcBuffer);
    free(sampleSizes);
    free(dictBuffer);
    return result;
}