# Document to Structured Data

In [1]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Set Up Azure OpenAI

In [2]:
import os
import openai
from dotenv import load_dotenv

print('openai version: ', openai.__version__)

# Set up Azure OpenAI
load_dotenv()
openai.api_type = "azure"
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = "2023-03-15-preview"
openai.api_key = os.getenv("OPENAI_API_KEY")

openai version:  0.27.6


True

## Deploy a Model

In [3]:
# id of desired_model
desired_model = 'gpt-4-32k' 
desired_capability = 'chat_completion' # apply as completion, since gpt-4 is only released as chat in Azure OpenAI

# list models deployed with
deployment_id = None
result = openai.Deployment.list()

for deployment in result.data:
    if deployment["status"] != "succeeded":
        continue
    
    model = openai.Model.retrieve(deployment["model"])
    print(model)
    # check if desired_model is deployed, and if it has 'completion' capability
    if model["id"] == desired_model and model['capabilities'][desired_capability]:
        deployment_id = deployment["id"]
        
# if no model deployed, deploy one
if not deployment_id:
    print('No deployment with status: succeeded found.')

    # Deploy the model
    print(f'Creating a new deployment with model: {desired_model}')
    result = openai.Deployment.create(model=desired_model, scale_settings={"scale_type":"standard"})
    deployment_id = result["id"]
    print(f'Successfully created {desired_model} that supports text {desired_capability} with id: {deployment_id}.')
else:
    print(f'Found a succeeded deployment of "{desired_model}" that supports text {desired_capability} with id: {deployment_id}.')

{
  "capabilities": {
    "chat_completion": true,
    "completion": false,
    "embeddings": false,
    "fine_tune": false,
    "inference": true,
    "scale_types": [
      "standard"
    ]
  },
  "created_at": 1679356800,
  "deprecation": {
    "inference": 1742515200
  },
  "id": "gpt-4-32k",
  "lifecycle_status": "preview",
  "object": "model",
  "status": "succeeded",
  "updated_at": 1679356800
}
{
  "capabilities": {
    "chat_completion": true,
    "completion": false,
    "embeddings": false,
    "fine_tune": false,
    "inference": true,
    "scale_types": [
      "standard"
    ]
  },
  "created_at": 1679356800,
  "deprecation": {
    "inference": 1742515200
  },
  "id": "gpt-4",
  "lifecycle_status": "preview",
  "object": "model",
  "status": "succeeded",
  "updated_at": 1679356800
}
{
  "capabilities": {
    "chat_completion": false,
    "completion": true,
    "embeddings": false,
    "fine_tune": false,
    "inference": true,
    "scale_types": [
      "standard"
    ]


## Request API

In [5]:
response = openai.ChatCompletion.create(
  engine = deployment_id,
  messages = [{"role":"user","content":"who are you?"},],
  temperature=0.5,
  max_tokens=26693,
  top_p=0.95,
  frequency_penalty=0,
  presence_penalty=0,
  stop=None)

response

<OpenAIObject chat.completion id=chatcmpl-7EHCepJywVdvI7k1KBHqJfQtneDg8 at 0x7f5aa0cc40e0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "I am an AI language model created by OpenAI, called GPT-3. My purpose is to assist users in answering questions, providing information, and engaging in conversation.",
        "role": "assistant"
      }
    }
  ],
  "created": 1683637076,
  "id": "chatcmpl-7EHCepJywVdvI7k1KBHqJfQtneDg8",
  "model": "gpt-4-32k",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 36,
    "prompt_tokens": 10,
    "total_tokens": 46
  }
}

In [6]:
response['choices'][0]['message']['role']
response['choices'][0]['message']['content']

'assistant'

'I am an AI language model created by OpenAI, called GPT-3. My purpose is to assist users in answering questions, providing information, and engaging in conversation.'

In [7]:
def request_api(messages, deployment_id):
    response = openai.ChatCompletion.create(
        engine = deployment_id,
        messages = messages,
        temperature=0,
        max_tokens=15000,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop='###')
    return response

## Get Structured Data

In [8]:
def get_structured_data(document, prompt_postfix, deployment_id):
    content = prompt_postfix.replace('<document>', document)
    messages = [{"role":"user","content":content},]; #print(messages)

    structured_data = request_api(messages, deployment_id)
    return structured_data

### Document Type: Resume

In [9]:
fname = "../data/resume.txt"

with open(fname, 'r') as f:
    document = f.readlines()

# convert list to str
document = ' '.join(document); #print(document)

In [12]:
# prompt
prompt_postfix = """ <document>
  \n###
  \nExtract the key sections from the resume above into json.
"""
structured_data = get_structured_data(document, prompt_postfix, deployment_id)

In [13]:
print(structured_data['choices'][0]['message']['content'])

{
  "Contact": {
    "Email": "chew.yean.yam@gmail.com",
    "LinkedIn": "www.linkedin.com/in/cyyam"
  },
  "Top Skills": [
    "Research",
    "Microarray Analysis",
    "OpenCV"
  ],
  "Languages": {
    "English": "Native or Bilingual",
    "Malay": "Professional Working",
    "Mandarin": "Native or Bilingual",
    "Cantonese": "Native or Bilingual"
  },
  "Certifications": [
    "Uncovering Your Authentic Self at Work",
    "Worldwide Communities - Community SME 2018",
    "Fred Kofman on Managing Conflict"
  ],
  "Experience": {
    "Microsoft": {
      "Principal Data and Applied Scientist": {
        "Duration": "December 2021 - Present (1 year 6 months)",
        "Responsibilities": "Design and build AI applications with enterprises in Latin America Region across industries. Advise on their AI strategies, identify commercial opportunities and code with them to deployment. Enhance skills of team of Cloud Solution Architect specialised in Machine Learning through collaboration an