in smallpond/logical/dataset.py [0:0]
def resolved_paths(self) -> List[str]:
    """
    Return the sorted list of file paths of this dataset, with wildcards expanded.

    Every entry of ``self.absolute_paths`` that contains glob magic characters
    is expanded with ``glob.glob`` (honouring ``self.recursive``); all other
    entries are kept verbatim. The combined list is sorted, stored in
    ``self._resolved_paths`` and reused on every later call.

    Example::
    >>> DataSet(['data/100.parquet', '/datasetA/*.parquet']).resolved_paths
    ['/datasetA/1.parquet', '/datasetA/2.parquet', '/home/user/data/100.parquet']
    """
    # Fast path: the expansion has already been computed and cached.
    if self._resolved_paths is not None:
        return self._resolved_paths

    def expand(pattern):
        # Expand one wildcard pattern into the paths it matches.
        return glob.glob(pattern, recursive=self.recursive)

    # Partition the inputs into literal paths and wildcard patterns.
    literals = []
    patterns = []
    for candidate in self.absolute_paths:
        bucket = patterns if has_magic(candidate) else literals
        bucket.append(candidate)

    if patterns:
        pattern_count = len(patterns)
        if pattern_count > 1:
            logger.debug(
                "resolving {} paths with wildcards in {}",
                pattern_count,
                self,
            )
            # Several patterns: expand them concurrently, at most 32 threads.
            matches = []
            with ThreadPoolExecutor(min(32, pattern_count)) as executor:
                for found in executor.map(expand, patterns):
                    matches.extend(found)
        else:
            # A single pattern is expanded inline, without a thread pool.
            matches = expand(patterns[0])
        literals.extend(matches)
        logger.debug(
            "resolved {} files from {} wildcard path(s) in {}",
            len(matches),
            pattern_count,
            self,
        )

    literals.sort()
    self._resolved_paths = literals
    return self._resolved_paths