use_cases/archive/document_to_structured_data/notebooks/document-to-structured-data.ipynb

{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Document to Structured Data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "from IPython.core.interactiveshell import InteractiveShell\n", "InteractiveShell.ast_node_interactivity = \"all\"" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Set Up Azure OpenAI" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "openai version: 0.27.6\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import openai\n", "from dotenv import load_dotenv\n", "\n", "print('openai version: ', openai.__version__)\n", "\n", "# Set up Azure OpenAI\n", "load_dotenv()\n", "openai.api_type = \"azure\"\n", "openai.api_base = os.getenv(\"OPENAI_API_BASE\")\n", "openai.api_version = \"2023-03-15-preview\"\n", "openai.api_key = os.getenv(\"OPENAI_API_KEY\")" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Deploy a Model" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"capabilities\": {\n", " \"chat_completion\": true,\n", " \"completion\": false,\n", " \"embeddings\": false,\n", " \"fine_tune\": false,\n", " \"inference\": true,\n", " \"scale_types\": [\n", " \"standard\"\n", " ]\n", " },\n", " \"created_at\": 1679356800,\n", " \"deprecation\": {\n", " \"inference\": 1742515200\n", " },\n", " \"id\": \"gpt-4-32k\",\n", " \"lifecycle_status\": \"preview\",\n", " \"object\": \"model\",\n", " \"status\": \"succeeded\",\n", " \"updated_at\": 1679356800\n", "}\n", "{\n", " \"capabilities\": {\n", " \"chat_completion\": true,\n", " \"completion\": false,\n", " \"embeddings\": 
false,\n", " \"fine_tune\": false,\n", " \"inference\": true,\n", " \"scale_types\": [\n", " \"standard\"\n", " ]\n", " },\n", " \"created_at\": 1679356800,\n", " \"deprecation\": {\n", " \"inference\": 1742515200\n", " },\n", " \"id\": \"gpt-4\",\n", " \"lifecycle_status\": \"preview\",\n", " \"object\": \"model\",\n", " \"status\": \"succeeded\",\n", " \"updated_at\": 1679356800\n", "}\n", "{\n", " \"capabilities\": {\n", " \"chat_completion\": false,\n", " \"completion\": true,\n", " \"embeddings\": false,\n", " \"fine_tune\": false,\n", " \"inference\": true,\n", " \"scale_types\": [\n", " \"standard\"\n", " ]\n", " },\n", " \"created_at\": 1664496000,\n", " \"deprecation\": {\n", " \"inference\": 1727654400\n", " },\n", " \"id\": \"text-davinci-003\",\n", " \"lifecycle_status\": \"preview\",\n", " \"object\": \"model\",\n", " \"status\": \"succeeded\",\n", " \"updated_at\": 1664496000\n", "}\n", "Found a succeeded deployment of \"gpt-4-32k\" that supports text chat_completion with id: gpt-4-32k.\n" ] } ],
"source": [ "# id of desired_model\n", "desired_model = 'gpt-4-32k'\n", "desired_capability = 'chat_completion'  # use the chat API, since gpt-4 is only released as chat in Azure OpenAI\n", "\n", "# look for an existing deployment that serves desired_model\n", "deployment_id = None\n", "result = openai.Deployment.list()\n", "\n", "for deployment in result.data:\n", "    # only deployments whose status is \"succeeded\" are considered\n", "    if deployment[\"status\"] != \"succeeded\":\n", "        continue\n", "\n", "    model = openai.Model.retrieve(deployment[\"model\"])\n", "    print(model)\n", "    # check if desired_model is deployed, and if it has the desired capability\n", "    if model[\"id\"] == desired_model and model['capabilities'][desired_capability]:\n", "        deployment_id = deployment[\"id\"]\n", "\n", "# if no suitable deployment was found, deploy one\n", "if not deployment_id:\n", "    print(f'No succeeded deployment of \"{desired_model}\" that supports {desired_capability} found.')\n", "\n", "    # Deploy the model\n", "    print(f'Creating a new deployment with model: {desired_model}')\n", "    result = openai.Deployment.create(model=desired_model, scale_settings={\"scale_type\":\"standard\"})\n", "    deployment_id = result[\"id\"]\n", "    print(f'Successfully created {desired_model} that supports text {desired_capability} with id: {deployment_id}.')\n", "else:\n", "    print(f'Found a succeeded deployment of \"{desired_model}\" that supports text {desired_capability} with id: {deployment_id}.')" ] },
{ "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Request API" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<OpenAIObject chat.completion id=chatcmpl-7EHCepJywVdvI7k1KBHqJfQtneDg8 at 0x7f5aa0cc40e0> JSON: {\n", " \"choices\": [\n", " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 0,\n", " \"message\": {\n", " \"content\": \"I am an AI language model created by OpenAI, called GPT-3. My purpose is to assist users in answering questions, providing information, and engaging in conversation.\",\n", " \"role\": \"assistant\"\n", " }\n", " }\n", " ],\n", " \"created\": 1683637076,\n", " \"id\": \"chatcmpl-7EHCepJywVdvI7k1KBHqJfQtneDg8\",\n", " \"model\": \"gpt-4-32k\",\n", " \"object\": \"chat.completion\",\n", " \"usage\": {\n", " \"completion_tokens\": 36,\n", " \"prompt_tokens\": 10,\n", " \"total_tokens\": 46\n", " }\n", "}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ],
"source": [ "# smoke test: send a single chat message to the selected deployment\n", "response = openai.ChatCompletion.create(\n", "    engine = deployment_id,\n", "    messages = [{\"role\":\"user\",\"content\":\"who are you?\"},],\n", "    temperature=0.5,\n", "    max_tokens=26693,\n", "    top_p=0.95,\n", "    frequency_penalty=0,\n", "    presence_penalty=0,\n", "    stop=None)\n", "\n", "response" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'assistant'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "'I am an AI language model created by OpenAI, 
called GPT-3. My purpose is to assist users in answering questions, providing information, and engaging in conversation.'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "response['choices'][0]['message']['role']\n", "response['choices'][0]['message']['content']" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def request_api(messages, deployment_id, temperature=0, max_tokens=15000, stop='###'):\n", "    \"\"\"Send chat messages to an Azure OpenAI deployment and return the raw response.\n", "\n", "    Args:\n", "        messages: list of chat message dicts with 'role' and 'content' keys.\n", "        deployment_id: id of the deployment to query (passed to the API as engine).\n", "        temperature: sampling temperature (default 0).\n", "        max_tokens: upper bound on the number of generated tokens (default 15000).\n", "        stop: sequence at which the API stops generating (default '###').\n", "\n", "    Returns:\n", "        The response object returned by openai.ChatCompletion.create.\n", "    \"\"\"\n", "    return openai.ChatCompletion.create(\n", "        engine=deployment_id,\n", "        messages=messages,\n", "        temperature=temperature,\n", "        max_tokens=max_tokens,\n", "        top_p=0.95,\n", "        frequency_penalty=0,\n", "        presence_penalty=0,\n", "        stop=stop,\n", "    )" ] },
{ "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Get Structured Data" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def get_structured_data(document, prompt_postfix, deployment_id):\n", "    \"\"\"Ask the model to extract structured data from a document.\n", "\n", "    Args:\n", "        document: full text of the document.\n", "        prompt_postfix: prompt template; each '<document>' placeholder is replaced by document.\n", "        deployment_id: id of the Azure OpenAI deployment to query.\n", "\n", "    Returns:\n", "        The raw chat completion response returned by request_api.\n", "\n", "    Raises:\n", "        ValueError: if prompt_postfix has no '<document>' placeholder, because the\n", "            document would otherwise be silently left out of the prompt.\n", "    \"\"\"\n", "    if '<document>' not in prompt_postfix:\n", "        raise ValueError(\"prompt_postfix must contain the '<document>' placeholder\")\n", "\n", "    content = prompt_postfix.replace('<document>', document)\n", "    messages = [{\"role\": \"user\", \"content\": content}]\n", "    return request_api(messages, deployment_id)" ] },
{ "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Document Type: Resume" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "fname = \"../data/resume.txt\"\n", "\n", "# read the whole file as one string; joining readlines() with ' ' would\n", "# insert a stray space in front of every line after the first\n", "with open(fname, 'r', encoding='utf-8') as f:\n", "    document = f.read()" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# prompt\n", "prompt_postfix = \"\"\" <document>\n", " \\n###\n", " \\nExtract the key sections from the resume above into json.\n", "\"\"\"\n", "structured_data = get_structured_data(document, prompt_postfix, deployment_id)" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "{\n", " \"Contact\": {\n", " \"Email\": \"chew.yean.yam@gmail.com\",\n", " \"LinkedIn\": \"www.linkedin.com/in/cyyam\"\n", " },\n", " \"Top Skills\": [\n", " \"Research\",\n", " \"Microarray Analysis\",\n", " \"OpenCV\"\n", " ],\n", " \"Languages\": {\n", " \"English\": \"Native or Bilingual\",\n", " \"Malay\": \"Professional Working\",\n", " \"Mandarin\": \"Native or Bilingual\",\n", " \"Cantonese\": \"Native or Bilingual\"\n", " },\n", " \"Certifications\": [\n", " \"Uncovering Your Authentic Self at Work\",\n", " \"Worldwide Communities - Community SME 2018\",\n", " \"Fred Kofman on Managing Conflict\"\n", " ],\n", " \"Experience\": {\n", " \"Microsoft\": {\n", " \"Principal Data and Applied Scientist\": {\n", " \"Duration\": \"December 2021 - Present (1 year 6 months)\",\n", " \"Responsibilities\": \"Design and build AI applications with enterprises in Latin America Region across industries. Advise on their AI strategies, identify commercial opportunities and code with them to deployment. Enhance skills of team of Cloud Solution Architect specialised in Machine Learning through collaboration and education.\"\n", " },\n", " \"Principal Data and Applied Scientist\": {\n", " \"Duration\": \"December 2019 - December 2021 (2 years 1 month)\",\n", " \"Responsibilities\": \"Design and build AI Capabilities with enterprises. AI Incubator. Media Industry aligned with the MovieLabs Ontology for Media Creation. Gaming Industry Customer Churn and Data Analytics capabilities for Gaming Platform Playfab.\"\n", " },\n", " \"Principal Data and Applied Science Manager\": {\n", " \"Duration\": \"May 2018 - November 2019 (1 year 7 months)\",\n", " \"Responsibilities\": \"Lead a Data Science team. Design and build many AI applications with enterprises in the EMEA region across industries. Advise on their AI strategies, identify commercial opportunities and code with them to deployment. 
Translate business requirements into machine learning solution.\"\n", " },\n", " \"Principal Data and Applied Scientist\": {\n", " \"Duration\": \"April 2015 - April 2018 (3 years 1 month)\",\n", " \"Responsibilities\": \"Design and build many AI applications with enterprises world-wide across industries. Advise on their AI strategies, identify commercial opportunities and code with them to deployment. Translate business requirements into machine learning solution.\"\n", " }\n", " },\n", " \"Data Science and Machine Learning Study Group (MeetUp)\": {\n", " \"Co-Founder/Organizer\": {\n", " \"Duration\": \"November 2019 - Present (3 years 7 months)\",\n", " \"Responsibilities\": \"Empower every single person in Bristol and surrounding areas to achieve quality Machine Learning and Data Science knowledge.\"\n", " }\n", " },\n", " \"University of Bristol\": {\n", " \"External Advisory Board Member\": {\n", " \"Duration\": \"January 2019 - Present (4 years 5 months)\",\n", " \"Responsibilities\": \"Voluntary. 
Advice on Data Science and Machine Learning related courses, bringing academia and industry closer, diversity and inclusion.\"\n", " }\n", " },\n", " \"BAE Systems\": {\n", " \"Principal Research Scientist\": {\n", " \"Duration\": \"April 2013 - March 2015 (2 years)\",\n", " \"Responsibilities\": \"Text Document Analysis, Event Detection and Extraction, Health Impact Assessment, Unmanned Aerial Vehicle (UAV) - ASTREA.\"\n", " }\n", " },\n", " \"Hewlett-Packard Laboratories\": {\n", " \"Research Scientist\": {\n", " \"Duration\": \"November 2010 - October 2012 (2 years)\",\n", " \"Responsibilities\": \"Cloud Computing Stewardship and Economics, Discrete event model and simulation of interaction within cloud computing ecosystem.\"\n", " }\n", " },\n", " \"National Physical Laboratory\": {\n", " \"Senior Scientist\": {\n", " \"Duration\": \"October 2007 - March 2010 (2 years 6 months)\",\n", " \"Responsibilities\": \"Biometrics, Anti-Counterfeit, Error Analysis in 3-D Face Reconstruction, Image Correlation, Uncertainty propagation through an imaging system, Business Development.\"\n", " }\n", " },\n", " \"University of Southampton, Electronics and Computer Science\": {\n", " \"Research Scientist\": {\n", " \"Duration\": \"January 2007 - September 2007 (9 months)\",\n", " \"Responsibilities\": \"Human Motion, Biometrics, Course delivery and project formulation.\"\n", " }\n", " },\n", " \"Intel\": {\n", " \"Senior Technical Marketing Engineer\": {\n", " \"Duration\": \"August 2004 - November 2006 (2 years 4 months)\",\n", " \"Responsibilities\": \"Digital Security Surveillance, Business Development and Partnership, Market Awareness for Emerging Market, Authored technical articles, Developed demonstration kit.\"\n", " }\n", " },\n", " \"A*STAR - Agency for Science, Technology and Research\": {\n", " \"Research Fellow\": {\n", " \"Duration\": \"April 2003 - July 2004 (1 year 4 months)\",\n", " \"Responsibilities\": \"Signal processing, statistics, Micro-array Analysis, 
Bioinformatics, Requirement Analysis and Software Design.\"\n", " }\n", " },\n", " \"University of Southampton - Electronics and Computer Science\": {\n", " \"Research Consultant\": {\n", " \"Duration\": \"January 2002 - December 2002 (1 year)\",\n", " \"Responsibilities\": \"Computer vision, time-series analytic, pattern recognition, laboratory design, large data collection and processing, Invented first model-based walking and running gait biometrics system for person, gender and age-group identification.\"\n", " }\n", " }\n", " },\n", " \"Education\": {\n", " \"University of Southampton\": {\n", " \"Ph.D.\": {\n", " \"Field\": \"Computer Vision, Image Processing, Human Motion Modelling, Biometrics\",\n", " \"Duration\": \"1999 - 2002\"\n", " },\n", " \"B. Eng.\": {\n", " \"Field\": \"Computer Engineering\",\n", " \"Duration\": \"1997 - 1999\"\n", " }\n", " }\n", " }\n", "}\n" ] } ], "source": [ "print(structured_data['choices'][0]['message']['content'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "azureml_py38", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "6d65a8c07f5b6469e0fc613f182488c0dccce05038bbda39e5ac9075c0454d11" } } }, "nbformat": 4, "nbformat_minor": 2 }

use_cases/archive/document_to_structured_data/notebooks/document-to-structured-data.ipynb (512 lines of code) (raw):