init/jupyter/gravitino-fileset-example.ipynb

{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "a7921ace-4d57-4a8e-934c-7e49a4f268e6", "metadata": {}, "outputs": [], "source": [ "pip install hdfs" ] }, { "cell_type": "code", "execution_count": null, "id": "db993abd-02bd-465b-a94a-ad7ed7e0e234", "metadata": {}, "outputs": [], "source": [ "from hdfs import InsecureClient\n", "import os\n", "\n", "# Create a HDFS connector client\n", "hdfs_client = InsecureClient(\"http://hive:50070\", user='root')\n", "\n", "# List HDFS file and directories\n", "print(hdfs_client.list('/user/gravitino'))\n", "\n", "hdfs_client.delete(\"/user/gravitino\")" ] }, { "cell_type": "code", "execution_count": null, "id": "fcfb73be-e369-4543-b78d-fb1cd061c9e1", "metadata": {}, "outputs": [], "source": [ "pip install apache-gravitino==0.8.0" ] }, { "cell_type": "code", "execution_count": null, "id": "384715f6-a200-448f-b3b2-bf03931769bf", "metadata": {}, "outputs": [], "source": [ "from typing import Dict, List\n", "from gravitino import NameIdentifier, GravitinoAdminClient, GravitinoClient, Catalog, Fileset, FilesetChange\n", "import os \n", "\n", "# Create Gravitino admin client\n", "gravitino_admin_client = GravitinoAdminClient(uri=\"http://gravitino:8090\")\n", "\n", "# Create metalake via Gravitino admin client\n", "metalake_name=\"default\"\n", "metalake = gravitino_admin_client.create_metalake(name=metalake_name,\n", " comment=\"metalake comment\", \n", " properties={})\n", "print(metalake)" ] }, { "cell_type": "code", "execution_count": null, "id": "876c1ca4-7c41-4c9d-acde-faf2fe8c3bd8", "metadata": {}, "outputs": [], "source": [ "# Create Gravitino client\n", "gravitino_client = GravitinoClient(uri=\"http://gravitino:8090\", metalake_name=metalake_name)" ] }, { "cell_type": "code", "execution_count": null, "id": "d74be244-fa6a-4065-bed4-ba47bc5cd32e", "metadata": {}, "outputs": [], "source": [ "from typing import Dict, List\n", "from gravitino import GravitinoMetalake\n", "\n", "# List all Gravitino metalake entity\n", "metalake_list: List[GravitinoMetalake] = gravitino_admin_client.list_metalakes()\n", "print(metalake_list)" ] }, { "cell_type": "code", "execution_count": null, "id": "6ed1f174-8b21-4191-a1a4-14718eaa87ee", "metadata": {}, "outputs": [], "source": [ "# Create catalog via Gravition client\n", "catalog_name=\"catalog\"\n", "\n", "catalog = gravitino_client.create_catalog(name=catalog_name,\n", " catalog_type=Catalog.Type.FILESET,\n", " provider=\"hadoop\", \n", " comment=\"\",\n", " properties={})\n", "print(catalog)" ] }, { "cell_type": "code", "execution_count": null, "id": "adf8c2ec-17e2-4550-a1cd-20430d59520c", "metadata": {}, "outputs": [], "source": [ "# Load catalog entity via Gravition client\n", "catalog = gravitino_client.load_catalog(name=catalog_name)\n", "print(catalog)" ] }, { "cell_type": "code", "execution_count": null, "id": "8fc69b06-0c7f-4fff-a8d8-4f257399af9a", "metadata": {}, "outputs": [], "source": [ "# Create schema entity via Gravition client\n", "schema_name=\"schema\"\n", "schema_path=\"/user/gravitino/\"+schema_name\n", "schema_hdfs_path=f\"hdfs://hive:9000{schema_path}\"\n", "\n", "catalog.as_schemas().create_schema(schema_name=schema_name, \n", " comment=\"\", \n", " properties={\"location\":schema_hdfs_path})\n", "\n", "# Check if the schema location was successfully created in HDFS\n", "try:\n", " info = hdfs_client.status(schema_path)\n", " print(f\"Success: The storage location {schema_path} was successfully created.\")\n", " print(\"Details:\", info)\n", "except Exception:\n", " 
print(f\"Faild: The storage location {schema_path} was not successfully created.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "53ae1b3f-53a0-42c3-b9c2-3ab6ed0defd1", "metadata": {}, "outputs": [], "source": [ "\n", "# Create a managed type of Fileset\n", "managed_fileset_name=\"managed_fileset\"\n", "managed_fileset_path=\"/user/gravitino/\"+schema_name+\"/\"+managed_fileset_name\n", "managed_fileset_hdfs_path=f\"hdfs://hive:9000{managed_fileset_path}\"\n", "\n", "managed_fileset_ident: NameIdentifier = NameIdentifier.of(schema_name, managed_fileset_name)\n", "catalog.as_fileset_catalog().create_fileset(ident=managed_fileset_ident,\n", " fileset_type=Fileset.Type.MANAGED,\n", " comment=\"\",\n", " storage_location=managed_fileset_hdfs_path,\n", " properties={})\n", "\n", "# Check if the fileset location was successfully created in HDFS\n", "try:\n", " info = hdfs_client.status(managed_fileset_path)\n", " print(f\"Success: The storage location {managed_fileset_path} was successfully created.\")\n", " print(\"Details:\", info) # print hdfs path detail informations\n", "except Exception:\n", " print(f\"Faild: The storage location {managed_fileset_path} was not successfully created.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "08d7b09f-e732-4c4a-aff4-8f7e0aaa61de", "metadata": {}, "outputs": [], "source": [ "external_fileset_name=\"external_fileset\"\n", "external_fileset_path=\"/user/gravitino/\"+schema_name+\"/\"+external_fileset_name\n", "external_fileset_hdfs_path=f\"hdfs://hive:9000{external_fileset_path}\"\n", "\n", "# Create a fileset path in HDFS in advance\n", "hdfs_client.makedirs(external_fileset_path)\n", "try:\n", " info = hdfs_client.status(external_fileset_path)\n", " print(f\"Success: The storage location {external_fileset_path} was successfully created.\")\n", " print(\"Details:\", info) # print hdfs path detail information\n", "except Exception:\n", " print(f\"Faild: The storage location {external_fileset_path} was not successfully created.\")\n", "\n", "# Create an external type of fileset\n", "external_fileset_ident: NameIdentifier = NameIdentifier.of(schema_name, external_fileset_name)\n", "catalog.as_fileset_catalog().create_fileset(ident=external_fileset_ident,\n", " fileset_type=Fileset.Type.EXTERNAL,\n", " comment=\"\",\n", " storage_location=external_fileset_hdfs_path,\n", " properties={})" ] }, { "cell_type": "code", "execution_count": null, "id": "097454c8-0214-4466-ac9e-164f88929853", "metadata": {}, "outputs": [], "source": [ "# List all fileset\n", "catalog = gravitino_client.load_catalog(name=catalog_name)\n", "fileset_list: List[NameIdentifier] = catalog.as_fileset_catalog().list_filesets(namespace=managed_fileset_ident.namespace())\n", "print(fileset_list)" ] }, { "cell_type": "code", "execution_count": null, "id": "9e4761eb-9533-47f0-9078-f7efdd3a01ee", "metadata": {}, "outputs": [], "source": [ "# Load managed fileset\n", "managed_fileset=gravitino_client.load_catalog(name=catalog_name).as_fileset_catalog().load_fileset(ident=managed_fileset_ident)\n", "print(managed_fileset)" ] }, { "cell_type": "code", "execution_count": null, "id": "9ad95186-b16e-483a-97ef-3a4ddec49033", "metadata": {}, "outputs": [], "source": [ "# Load external fileset\n", "external_fileset=gravitino_client.load_catalog(name=catalog_name).as_fileset_catalog().load_fileset(ident=external_fileset_ident)\n", "print(external_fileset)" ] }, { "cell_type": "code", "execution_count": null, "id": "8164406c-808c-47f4-84da-b708dc1d3d3e", "metadata": {}, 
"outputs": [], "source": [ "# Drop managed type of fileset and deleted HDFS location\n", "catalog.as_fileset_catalog().drop_fileset(ident=managed_fileset_ident)\n", "\n", "# Check managed type of fileset location if successfully deleted\n", "try:\n", " info = hdfs_client.status(managed_fileset_path)\n", " print(f\"Faild: The storage location {managed_fileset_path} was not successfully deleted.\")\n", "except Exception:\n", " print(f\"Success: The storage location {managed_fileset_path} was successfully deleted.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "741e35d2-676f-4b97-b47b-e5374168d1ef", "metadata": {}, "outputs": [], "source": [ "# Drop external type of fileset, Should not be deleted HDFS location\n", "catalog.as_fileset_catalog().drop_fileset(ident=external_fileset_ident)\n", "\n", "# Check managed type of fileset location if reserved\n", "try:\n", " info = hdfs_client.status(external_fileset_path)\n", " print(f\"Success: The storage location {external_fileset_path} reserved.\")\n", "except Exception:\n", " print(f\"Faild: The storage location {external_fileset_path} deleted.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "db137c72-e6dd-4e07-bc09-1b18803a6e16", "metadata": {}, "outputs": [], "source": [ "# Drop schema\n", "catalog.as_schemas().drop_schema(schema_name=schema_name, cascade=True)\n", "\n", "# Check schema location if successfully deleted\n", "try:\n", " info = hdfs_client.status(schema_path)\n", " print(f\"Faild: The storage location {schema_path} was not successfully deleted.\")\n", "except Exception:\n", " print(f\"Success: The storage location {schema_path} was successfully deleted.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "cb4cfd65-6d8c-4b67-b8ad-7aad991888f9", "metadata": {}, "outputs": [], "source": [ "# Drop catalog\n", "result=gravitino_client.drop_catalog(name=catalog_name, force=True)\n", "print(result)" ] }, { "cell_type": "code", "execution_count": null, "id": "00309040-edd0-4d48-ab36-a6c598294ed7", "metadata": {}, "outputs": [], "source": [ "# Drop metalake\n", "result=gravitino_admin_client.drop_metalake(metalake_name, force=True)\n", "print(result)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }