in data_preparation/metadata_completion/text_cleaner.py [0:0]
def clean_all_text_data(metadataList, pathInDir, pathOutDir):
    """Clean the text files referenced by *metadataList* into *pathOutDir*.

    For each metadata JSON file name in *metadataList*, the companion text
    file (via ``get_txt_name``) is looked up in *pathInDir*.  Text whose
    ``url_text_source`` is not a Gutenberg URL is copied verbatim; Gutenberg
    text is run through ``loadData`` to strip boilerplate before being saved.

    Args:
        metadataList: iterable of metadata JSON file names, relative to
            *pathInDir*.
        pathInDir: directory holding the metadata and raw text files.
        pathOutDir: destination directory for cleaned text; must differ
            from *pathInDir*.

    Returns:
        list of ``(metadataName, status)`` tuples, where status is one of
        ``"clear"``, ``"noisy"`` or ``"missing"``.

    Raises:
        ValueError: if *pathInDir* and *pathOutDir* resolve to the same
            directory (the originals would be overwritten).
    """
    # Local import: the module's import header is outside this chunk, so we
    # bring shutil into scope here rather than editing unseen code.
    import shutil

    pathInDir = os.path.abspath(pathInDir)
    pathOutDir = os.path.abspath(pathOutDir)
    if pathInDir == pathOutDir:
        raise ValueError(
            "Can't save the data in the same directory as the originals")
    bar = progressbar.ProgressBar(maxval=len(metadataList))
    bar.start()
    nCleaned = 0
    nMissing = 0
    nNotWorking = 0
    emptyTxt = []
    out = []
    for index, metadataName in enumerate(metadataList):
        bar.update(index)
        textFileName = get_txt_name(metadataName)
        pathInFile = os.path.join(pathInDir, textFileName)
        outPathFile = os.path.join(pathOutDir, textFileName)
        if not os.path.isfile(pathInFile):
            status = "missing"
            nMissing += 1
        else:
            with open(os.path.join(pathInDir, metadataName), 'rb') as file:
                urlSource = json.load(file)["url_text_source"]
            if not is_guttenberg_url(urlSource):
                # shutil.copyfile is synchronous and safe for any path,
                # unlike the previous `os.popen(f'cp ...')` shell call
                # (which returned before the copy finished and broke on
                # paths containing spaces).
                shutil.copyfile(pathInFile, outPathFile)
                status = "clear"
            else:
                outData = loadData(pathInFile)
                if outData is None:
                    nNotWorking += 1
                    # Distinguish a 404 placeholder page (treat as missing)
                    # from genuinely malformed text (noisy).
                    if find404Error(pathInFile):
                        emptyTxt.append(pathInFile)
                        status = "missing"
                    else:
                        status = "noisy"
                else:
                    with open(outPathFile, 'w') as file:
                        file.write(outData)
                    status = "clear"
        out.append((metadataName, status))
        # Bug fix: only count items that were actually cleaned and saved;
        # previously this incremented on every iteration, so the summary
        # always reported len(metadataList).
        if status == "clear":
            nCleaned += 1
    bar.finish()
    print(f"Out of {len(metadataList)} items")
    print(f"{nCleaned} files were cleaned and saved to {pathOutDir}")
    print(f"{nNotWorking} files didn't match the good format among which {len(emptyTxt)} were empty")
    print(f"{nMissing} files were missing")
    return out