in datasets/geos_fp/pipelines/_images/rolling_copy/script.py [0:0]
def scrape(source_path: str, webpage: bs4.BeautifulSoup) -> typing.List[str]:
    """Collect the paths of all ".nc4" files linked from a parsed page.

    Args:
        source_path: Prefix joined to each matching link with a "/".
        webpage: Parsed page whose ``<a>`` tags are inspected.

    Returns:
        One ``"{source_path}/{href}"`` string per ``<a>`` tag whose ``href``
        is non-empty and ends in ".nc4", in the order the tags are found.
    """
    # Each anchor's `href` is treated as a filename,
    # e.g. GEOS.fp.asm.inst1_2d_smp_Nx.20210101_1700.V01.nc4
    hrefs = (anchor.get("href") for anchor in webpage.find_all("a"))
    # Anchors with a missing or empty `href` are skipped by the truthiness check.
    return [
        f"{source_path}/{href}"
        for href in hrefs
        if href and href.endswith(".nc4")
    ]