{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "a7921ace-4d57-4a8e-934c-7e49a4f268e6",
"metadata": {},
"outputs": [],
"source": [
"pip install hdfs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db993abd-02bd-465b-a94a-ad7ed7e0e234",
"metadata": {},
"outputs": [],
"source": [
"from hdfs import InsecureClient\n",
"import os\n",
"\n",
"# Create a HDFS connector client\n",
"hdfs_client = InsecureClient(\"http://hive:50070\", user='root')\n",
"\n",
"# List HDFS file and directories\n",
"print(hdfs_client.list('/user/gravitino'))\n",
"\n",
"hdfs_client.delete(\"/user/gravitino\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fcfb73be-e369-4543-b78d-fb1cd061c9e1",
"metadata": {},
"outputs": [],
"source": [
"pip install apache-gravitino==0.8.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "384715f6-a200-448f-b3b2-bf03931769bf",
"metadata": {},
"outputs": [],
"source": [
"from typing import Dict, List\n",
"from gravitino import NameIdentifier, GravitinoAdminClient, GravitinoClient, Catalog, Fileset, FilesetChange\n",
"import os \n",
"\n",
"# Create Gravitino admin client\n",
"gravitino_admin_client = GravitinoAdminClient(uri=\"http://gravitino:8090\")\n",
"\n",
"# Create metalake via Gravitino admin client\n",
"metalake_name=\"default\"\n",
"metalake = gravitino_admin_client.create_metalake(name=metalake_name,\n",
" comment=\"metalake comment\", \n",
" properties={})\n",
"print(metalake)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "876c1ca4-7c41-4c9d-acde-faf2fe8c3bd8",
"metadata": {},
"outputs": [],
"source": [
"# Create Gravitino client\n",
"gravitino_client = GravitinoClient(uri=\"http://gravitino:8090\", metalake_name=metalake_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d74be244-fa6a-4065-bed4-ba47bc5cd32e",
"metadata": {},
"outputs": [],
"source": [
"from typing import Dict, List\n",
"from gravitino import GravitinoMetalake\n",
"\n",
"# List all Gravitino metalake entity\n",
"metalake_list: List[GravitinoMetalake] = gravitino_admin_client.list_metalakes()\n",
"print(metalake_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ed1f174-8b21-4191-a1a4-14718eaa87ee",
"metadata": {},
"outputs": [],
"source": [
"# Create catalog via Gravition client\n",
"catalog_name=\"catalog\"\n",
"\n",
"catalog = gravitino_client.create_catalog(name=catalog_name,\n",
" catalog_type=Catalog.Type.FILESET,\n",
" provider=\"hadoop\", \n",
" comment=\"\",\n",
" properties={})\n",
"print(catalog)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "adf8c2ec-17e2-4550-a1cd-20430d59520c",
"metadata": {},
"outputs": [],
"source": [
"# Load catalog entity via Gravition client\n",
"catalog = gravitino_client.load_catalog(name=catalog_name)\n",
"print(catalog)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8fc69b06-0c7f-4fff-a8d8-4f257399af9a",
"metadata": {},
"outputs": [],
"source": [
"# Create schema entity via Gravition client\n",
"schema_name=\"schema\"\n",
"schema_path=\"/user/gravitino/\"+schema_name\n",
"schema_hdfs_path=f\"hdfs://hive:9000{schema_path}\"\n",
"\n",
"catalog.as_schemas().create_schema(schema_name=schema_name, \n",
" comment=\"\", \n",
" properties={\"location\":schema_hdfs_path})\n",
"\n",
"# Check if the schema location was successfully created in HDFS\n",
"try:\n",
" info = hdfs_client.status(schema_path)\n",
" print(f\"Success: The storage location {schema_path} was successfully created.\")\n",
" print(\"Details:\", info)\n",
"except Exception:\n",
" print(f\"Faild: The storage location {schema_path} was not successfully created.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53ae1b3f-53a0-42c3-b9c2-3ab6ed0defd1",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Create a managed type of Fileset\n",
"managed_fileset_name=\"managed_fileset\"\n",
"managed_fileset_path=\"/user/gravitino/\"+schema_name+\"/\"+managed_fileset_name\n",
"managed_fileset_hdfs_path=f\"hdfs://hive:9000{managed_fileset_path}\"\n",
"\n",
"managed_fileset_ident: NameIdentifier = NameIdentifier.of(schema_name, managed_fileset_name)\n",
"catalog.as_fileset_catalog().create_fileset(ident=managed_fileset_ident,\n",
" fileset_type=Fileset.Type.MANAGED,\n",
" comment=\"\",\n",
" storage_location=managed_fileset_hdfs_path,\n",
" properties={})\n",
"\n",
"# Check if the fileset location was successfully created in HDFS\n",
"try:\n",
" info = hdfs_client.status(managed_fileset_path)\n",
" print(f\"Success: The storage location {managed_fileset_path} was successfully created.\")\n",
" print(\"Details:\", info) # print hdfs path detail informations\n",
"except Exception:\n",
" print(f\"Faild: The storage location {managed_fileset_path} was not successfully created.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08d7b09f-e732-4c4a-aff4-8f7e0aaa61de",
"metadata": {},
"outputs": [],
"source": [
"external_fileset_name=\"external_fileset\"\n",
"external_fileset_path=\"/user/gravitino/\"+schema_name+\"/\"+external_fileset_name\n",
"external_fileset_hdfs_path=f\"hdfs://hive:9000{external_fileset_path}\"\n",
"\n",
"# Create a fileset path in HDFS in advance\n",
"hdfs_client.makedirs(external_fileset_path)\n",
"try:\n",
" info = hdfs_client.status(external_fileset_path)\n",
" print(f\"Success: The storage location {external_fileset_path} was successfully created.\")\n",
" print(\"Details:\", info) # print hdfs path detail information\n",
"except Exception:\n",
" print(f\"Faild: The storage location {external_fileset_path} was not successfully created.\")\n",
"\n",
"# Create an external type of fileset\n",
"external_fileset_ident: NameIdentifier = NameIdentifier.of(schema_name, external_fileset_name)\n",
"catalog.as_fileset_catalog().create_fileset(ident=external_fileset_ident,\n",
" fileset_type=Fileset.Type.EXTERNAL,\n",
" comment=\"\",\n",
" storage_location=external_fileset_hdfs_path,\n",
" properties={})"
]
},
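{
"cell_type": "code",
"execution_count": null,
"id": "3c9a1f2b-7d44-4e1b-9c55-2a0b6f1d8e47",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (not part of the original walkthrough): write a small\n",
"# sample file into the external fileset's storage location with the HDFS client\n",
"# and list the directory, to show that data placed there is visible to both\n",
"# sides. The file name sample.txt is only illustrative.\n",
"sample_file_path = external_fileset_path + \"/sample.txt\"\n",
"hdfs_client.write(sample_file_path, data=\"hello from the external fileset\\n\", overwrite=True, encoding=\"utf-8\")\n",
"print(hdfs_client.list(external_fileset_path))"
]
},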
{
"cell_type": "code",
"execution_count": null,
"id": "097454c8-0214-4466-ac9e-164f88929853",
"metadata": {},
"outputs": [],
"source": [
"# List all fileset\n",
"catalog = gravitino_client.load_catalog(name=catalog_name)\n",
"fileset_list: List[NameIdentifier] = catalog.as_fileset_catalog().list_filesets(namespace=managed_fileset_ident.namespace())\n",
"print(fileset_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e4761eb-9533-47f0-9078-f7efdd3a01ee",
"metadata": {},
"outputs": [],
"source": [
"# Load managed fileset\n",
"managed_fileset=gravitino_client.load_catalog(name=catalog_name).as_fileset_catalog().load_fileset(ident=managed_fileset_ident)\n",
"print(managed_fileset)"
]
},
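{
"cell_type": "code",
"execution_count": null,
"id": "b5e0c7d2-4a18-4f6e-8b3a-9d21c4e7f035",
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch: FilesetChange is imported above but not exercised. Assuming the\n",
"# client exposes alter_fileset plus FilesetChange.update_comment/set_property\n",
"# (mirroring the Java API), update the managed fileset's metadata and print the\n",
"# returned entity. The \"owner\" property key is only illustrative.\n",
"changed_fileset = catalog.as_fileset_catalog().alter_fileset(managed_fileset_ident,\n",
"                                                             FilesetChange.update_comment(\"managed fileset comment\"),\n",
"                                                             FilesetChange.set_property(\"owner\", \"demo\"))\n",
"print(changed_fileset)"
]
},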
{
"cell_type": "code",
"execution_count": null,
"id": "9ad95186-b16e-483a-97ef-3a4ddec49033",
"metadata": {},
"outputs": [],
"source": [
"# Load external fileset\n",
"external_fileset=gravitino_client.load_catalog(name=catalog_name).as_fileset_catalog().load_fileset(ident=external_fileset_ident)\n",
"print(external_fileset)"
]
},
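{
"cell_type": "code",
"execution_count": null,
"id": "e1d4a6c8-2b3f-4c7a-b9e0-5f8a7d6c4b21",
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch: access the fileset through the Gravitino Virtual File System\n",
"# (gvfs) instead of raw HDFS paths. This assumes the fsspec-based gvfs module\n",
"# bundled with the Python client (plus pyarrow for HDFS access) is available;\n",
"# the import path and constructor arguments may differ between client versions.\n",
"from gravitino.filesystem import gvfs\n",
"\n",
"fs = gvfs.GravitinoVirtualFileSystem(server_uri=\"http://gravitino:8090\",\n",
"                                     metalake_name=metalake_name)\n",
"print(fs.ls(f\"gvfs://fileset/{catalog_name}/{schema_name}/{external_fileset_name}\"))"
]
},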
{
"cell_type": "code",
"execution_count": null,
"id": "8164406c-808c-47f4-84da-b708dc1d3d3e",
"metadata": {},
"outputs": [],
"source": [
"# Drop managed type of fileset and deleted HDFS location\n",
"catalog.as_fileset_catalog().drop_fileset(ident=managed_fileset_ident)\n",
"\n",
"# Check managed type of fileset location if successfully deleted\n",
"try:\n",
" info = hdfs_client.status(managed_fileset_path)\n",
" print(f\"Faild: The storage location {managed_fileset_path} was not successfully deleted.\")\n",
"except Exception:\n",
" print(f\"Success: The storage location {managed_fileset_path} was successfully deleted.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "741e35d2-676f-4b97-b47b-e5374168d1ef",
"metadata": {},
"outputs": [],
"source": [
"# Drop external type of fileset, Should not be deleted HDFS location\n",
"catalog.as_fileset_catalog().drop_fileset(ident=external_fileset_ident)\n",
"\n",
"# Check managed type of fileset location if reserved\n",
"try:\n",
" info = hdfs_client.status(external_fileset_path)\n",
" print(f\"Success: The storage location {external_fileset_path} reserved.\")\n",
"except Exception:\n",
" print(f\"Faild: The storage location {external_fileset_path} deleted.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db137c72-e6dd-4e07-bc09-1b18803a6e16",
"metadata": {},
"outputs": [],
"source": [
"# Drop schema\n",
"catalog.as_schemas().drop_schema(schema_name=schema_name, cascade=True)\n",
"\n",
"# Check schema location if successfully deleted\n",
"try:\n",
" info = hdfs_client.status(schema_path)\n",
" print(f\"Faild: The storage location {schema_path} was not successfully deleted.\")\n",
"except Exception:\n",
" print(f\"Success: The storage location {schema_path} was successfully deleted.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb4cfd65-6d8c-4b67-b8ad-7aad991888f9",
"metadata": {},
"outputs": [],
"source": [
"# Drop catalog\n",
"result=gravitino_client.drop_catalog(name=catalog_name, force=True)\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "00309040-edd0-4d48-ab36-a6c598294ed7",
"metadata": {},
"outputs": [],
"source": [
"# Drop metalake\n",
"result=gravitino_admin_client.drop_metalake(metalake_name, force=True)\n",
"print(result)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}