def clean_all_text_data()

in data_preparation/metadata_completion/text_cleaner.py [0:0]


def clean_all_text_data(metadataList, pathInDir, pathOutDir):
    """Clean the text file attached to each metadata file and save the result.

    For every entry of ``metadataList`` the matching text file name is
    obtained with ``get_txt_name`` and looked up in ``pathInDir``:

    * if the text file does not exist, the item is reported as "missing";
    * if the metadata's "url_text_source" is not recognised by
      ``is_guttenberg_url``, the text file is copied unchanged to
      ``pathOutDir`` and the item is reported as "clear";
    * otherwise the text is loaded through ``loadData``. A non-``None``
      result is written to ``pathOutDir`` ("clear"). A ``None`` result is
      reported as "missing" when ``find404Error`` flags the file, and as
      "noisy" otherwise.

    Args:
        metadataList: names of the metadata (JSON) files, relative to
            ``pathInDir``.
        pathInDir: directory holding the metadata and the original texts.
        pathOutDir: directory receiving the cleaned texts. It must differ
            from ``pathInDir``.
            NOTE(review): the directory is not created here; it presumably
            must already exist -- confirm against the caller.

    Returns:
        A list of ``(metadataName, status)`` tuples, one per input item and
        in input order, where status is "clear", "missing" or "noisy".

    Raises:
        ValueError: if ``pathInDir`` and ``pathOutDir`` are the same
            directory, or if an output file would overwrite its source.
        OSError: if a text file cannot be copied or written.
    """
    # Imported locally so this function does not depend on a module-level
    # import of shutil being present.
    import shutil

    pathInDir = os.path.abspath(pathInDir)
    pathOutDir = os.path.abspath(pathOutDir)

    if pathInDir == pathOutDir:
        raise ValueError("Can't save the data in the same directory "
                         "as the originals")

    bar = progressbar.ProgressBar(maxval=len(metadataList))
    bar.start()
    nCleaned = 0
    nMissing = 0
    nNotWorking = 0
    emptyTxt = []
    out = []

    for index, metadataName in enumerate(metadataList):
        bar.update(index)
        textFileName = get_txt_name(metadataName)
        pathInFile = os.path.join(pathInDir, textFileName)
        outPathFile = os.path.join(pathOutDir, textFileName)

        if not os.path.isfile(pathInFile):
            status = "missing"
            nMissing += 1
        else:
            # os.path.join discards the directory when the file name is
            # absolute, so both paths can still collide even though the
            # directories differ. An explicit check (unlike assert) is kept
            # when Python runs with -O.
            if pathInFile == outPathFile:
                raise ValueError(
                    f"Refusing to overwrite the original file {pathInFile}")

            with open(os.path.join(pathInDir, metadataName), 'rb') as file:
                urlSource = json.load(file)["url_text_source"]

            if not is_guttenberg_url(urlSource):
                # Copy in-process: no shell quoting issues with unusual file
                # names, the copy is complete before we move on, and a
                # failure raises instead of being silently ignored.
                shutil.copyfile(pathInFile, outPathFile)
                status = "clear"
            else:
                outData = loadData(pathInFile)

                if outData is None:
                    nNotWorking += 1
                    if find404Error(pathInFile):
                        emptyTxt.append(pathInFile)
                        status = "missing"
                    else:
                        status = "noisy"
                else:
                    with open(outPathFile, 'w') as file:
                        file.write(outData)
                    status = "clear"

        out.append((metadataName, status))
        # Only items that actually produced an output file count as cleaned.
        if status == "clear":
            nCleaned += 1

    bar.finish()
    print(f"Out of {len(metadataList)} items")
    print(f"{nCleaned} files were cleaned and saved to {pathOutDir}")
    print(f"{nNotWorking} files didn't match the good format among which {len(emptyTxt)} were empty")
    print(f"{nMissing} files were missing")
    return out