def create_index()

in Autogen_v0.4/rag_agent/search_helper.py [0:0]


def create_index(index_name: str, analyzer_name: str = "en.microsoft", language_suffix: str = "en"):
    """Create an Azure AI Search index if it does not already exist.

    Builds an index definition containing a string key, language-specific
    text fields, four vector fields, two scoring profiles, a suggester, an
    HNSW vector-search configuration and a semantic configuration. The
    service is then probed with a GET; the definition is sent with a PUT only
    when the probe answers 404. Outcomes are reported with ``print``.

    Reads the module-level names ``azure_search_endpoint``,
    ``azure_openai_endpoint``, ``azure_openai_embedding__large_deployment``,
    ``embedding_model_name`` and ``azure_openai_key``, plus the
    ``AZURE_SEARCH_ADMIN_KEY`` environment variable.

    Args:
        index_name: Name of the index; also used in the request URL.
        analyzer_name: Analyzer assigned to the title, content, category
            and tags fields.
        language_suffix: Suffix appended to the title, content, category
            and tags field names (for example ``title_en``).

    Returns:
        None.
    """
    title_field = f"title_{language_suffix}"
    content_field = f"content_{language_suffix}"
    category_field = f"category_{language_suffix}"
    tags_field = f"tags_{language_suffix}"

    # NOTE(review): all four vector fields share the "amlHnswProfile" profile
    # and therefore the single "amlVectorizer", yet contentVector declares
    # 3072 dimensions while the others declare 1536 -- confirm that the
    # configured embedding deployment matches every field it serves.
    index_schema = {
        "name": index_name,
        "fields": [
            {
                "name": "id",
                "type": "Edm.String",
                "key": True,
                "sortable": True,
                "filterable": True,
                "facetable": True
            },
            {
                "name": "docName",
                "type": "Edm.String",
                "searchable": True
            },
            {
                "name": "pageNumber",
                "type": "Edm.String",
                "searchable": True
            },
            {
                "name": title_field,
                "type": "Edm.String",
                "analyzer": analyzer_name,
                "searchable": True
            },
            {
                "name": content_field,
                "type": "Edm.String",
                "analyzer": analyzer_name,
                "searchable": True
            },
            {
                "name": category_field,
                "type": "Collection(Edm.String)",
                "analyzer": analyzer_name,
                "filterable": True,
                "searchable": True
            },
            {
                "name": tags_field,
                "type": "Collection(Edm.String)",
                "analyzer": analyzer_name,
                "filterable": True,
                "searchable": True
            },
            {
                "name": "lastUpdated",
                "type": "Edm.DateTimeOffset"
            },
            {
                "name": "titleVector",
                "type": "Collection(Edm.Single)",
                "searchable": True,
                "dimensions": 1536,
                "vectorSearchProfile": "amlHnswProfile",
            },
            {
                "name": "contentVector",
                "type": "Collection(Edm.Single)",
                "searchable": True,
                "dimensions": 3072,
                "vectorSearchProfile": "amlHnswProfile",
            },
            {
                "name": "categoryVector",
                "type": "Collection(Edm.Single)",
                "searchable": True,
                "dimensions": 1536,
                "vectorSearchProfile": "amlHnswProfile",
            },
            {
                "name": "tagsVector",
                "type": "Collection(Edm.Single)",
                "searchable": True,
                "dimensions": 1536,
                "vectorSearchProfile": "amlHnswProfile",
            }
        ],
        "scoringProfiles": [
            {
                "name": "tagsBoost",
                "text": {
                    "weights": {
                        tags_field: 5
                    }
                },
                "functions": []
            },
            {
                "name": "newAndLatest",
                "functionAggregation": "sum",
                "functions": [
                    {
                        "fieldName": "lastUpdated",
                        "interpolation": "quadratic",
                        "type": "freshness",
                        "boost": 10,
                        "freshness": {
                            "boostingDuration": "P365D"
                        }
                    }
                ]
            }
        ],
        "suggesters": [
            {
                "name": "sg",
                "searchMode": "analyzingInfixMatching",
                "sourceFields": [title_field]
            }
        ],
        "vectorSearch": {
            "algorithms": [
                {
                    "name": "amlHnsw",
                    "kind": "hnsw",
                    "hnswParameters": {
                        "m": 4,
                        "metric": "cosine"
                    }
                }
            ],
            "profiles": [
                {
                    "name": "amlHnswProfile",
                    "algorithm": "amlHnsw",
                    "vectorizer": "amlVectorizer"
                }
            ],
            "vectorizers": [
                {
                    "name": "amlVectorizer",
                    "kind": "azureOpenAI",
                    "azureOpenAIParameters": {
                        "resourceUri": azure_openai_endpoint,
                        "deploymentId": azure_openai_embedding__large_deployment,
                        "modelName": embedding_model_name,
                        "apiKey": azure_openai_key
                    }
                }
            ]
        },
        "semantic": {
            "configurations": [
                {
                    "name": "aml-semantic-config",
                    "prioritizedFields": {
                        "titleField": {
                            "fieldName": title_field
                        },
                        "prioritizedKeywordsFields": [
                            {
                                "fieldName": category_field
                            },
                            {
                                "fieldName": tags_field
                            }
                        ],
                        "prioritizedContentFields": [
                            {
                                "fieldName": content_field
                            }
                        ]
                    }
                }
            ]
        }
    }

    headers = {
        'Content-Type': 'application/json',
        'api-key': os.getenv("AZURE_SEARCH_ADMIN_KEY", ""),
    }
    url = azure_search_endpoint + "/indexes/" + index_name + "?api-version=2024-07-01"

    # Without a timeout, requests waits indefinitely on a stalled connection.
    timeout_seconds = 30

    # Probe first so an existing index is never overwritten by the PUT.
    response = requests.get(url, headers=headers, timeout=timeout_seconds)

    if response.status_code == 404:
        # Create Index
        response = requests.put(
            url, headers=headers, json=index_schema, timeout=timeout_seconds
        )
        if response.ok:
            index = response.json()
            print(index)
        else:
            print(
                f"Failed to create index '{index_name}': "
                f"HTTP {response.status_code}: {response.text}"
            )
    elif response.ok:
        print("Index already exists")
    else:
        # Any other status (401/403/429/5xx, ...) says nothing about whether
        # the index exists, so report it rather than claiming that it does.
        print(
            f"Could not check index '{index_name}': "
            f"HTTP {response.status_code}: {response.text}"
        )