Unstructured Platform
Getting started with Platform
Using Platform
Azure AI Search
Send processed data from Unstructured to Azure AI Search.
The requirements are as follows.
The following video shows how to fulfill the minimum set of Azure AI Search requirements:
Here are some more details about these requirements:
-
The endpoint and API key for Azure AI Search. Create an endpoint and API key.
-
The name of the index in Azure AI Search. Create an index.
The Azure AI Search index that you use must have an index schema that is compatible with the schema of the documents that Unstructured produces for you. Unstructured cannot provide a schema that is guaranteed to work in all circumstances. This is because these schemas will vary based on your source files’ types; how you want Unstructured to partition, chunk, and generate embeddings; any custom post-processing code that you run; and other factors.
You can adapt the following index schema example for your own needs:
{ "@odata.context": "https://ingest-test-azure-ai-search.search.windows.net/$metadata#indexes/$entity", "@odata.etag": "\"0x8DCED5D96393CA9\"", "name": "<my-index-name>", "defaultScoringProfile": null, "fields": [ { "name": "id", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": true, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "record_id", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "element_id", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "text", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "embeddings", "type": "Collection(Edm.Single)", "searchable": true, "filterable": false, "retrievable": true, "stored": true, "sortable": false, "facetable": false, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": 3072, "vectorSearchProfile": "embeddings-config-profile", "vectorEncoding": null, "synonymMaps": [] }, 
{ "name": "type", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "metadata", "type": "Edm.ComplexType", "fields": [ { "name": "category_depth", "type": "Edm.Int32", "searchable": false, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "parent_id", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "attached_to_filename", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "filetype", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "last_modified", "type": "Edm.DateTimeOffset", "searchable": false, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": 
true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "is_continuation", "type": "Edm.Boolean", "searchable": false, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "file_directory", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "filename", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "data_source", "type": "Edm.ComplexType", "fields": [ { "name": "url", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "version", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": 
null, "synonymMaps": [] }, { "name": "date_created", "type": "Edm.DateTimeOffset", "searchable": false, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "date_modified", "type": "Edm.DateTimeOffset", "searchable": false, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "date_processed", "type": "Edm.DateTimeOffset", "searchable": false, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "permissions_data", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "record_locator", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] } ] }, { "name": "coordinates", "type": "Edm.ComplexType", "fields": [ { "name": "system", "type": "Edm.String", "searchable": true, "filterable": true, 
"retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "layout_width", "type": "Edm.Double", "searchable": false, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "layout_height", "type": "Edm.Double", "searchable": false, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "points", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] } ] }, { "name": "languages", "type": "Collection(Edm.String)", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": false, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "page_number", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, 
"vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "orig_elements", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "links", "type": "Collection(Edm.String)", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": false, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "page_name", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "url", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "link_urls", "type": "Collection(Edm.String)", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": false, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "link_texts", "type": "Collection(Edm.String)", "searchable": true, "filterable": true, "retrievable": true, "stored": true, 
"sortable": false, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "sent_from", "type": "Collection(Edm.String)", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": false, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "sent_to", "type": "Collection(Edm.String)", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": false, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "subject", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "section", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "header_footer_type", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, 
"synonymMaps": [] }, { "name": "emphasized_text_contents", "type": "Collection(Edm.String)", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": false, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "emphasized_text_tags", "type": "Collection(Edm.String)", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": false, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "text_as_html", "type": "Edm.String", "searchable": true, "filterable": false, "retrievable": true, "stored": true, "sortable": false, "facetable": false, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "regex_metadata", "type": "Edm.String", "searchable": true, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] }, { "name": "detection_class_prob", "type": "Edm.Double", "searchable": false, "filterable": true, "retrievable": true, "stored": true, "sortable": true, "facetable": true, "key": false, "indexAnalyzer": null, "searchAnalyzer": null, "analyzer": null, "normalizer": null, "dimensions": null, "vectorSearchProfile": null, "vectorEncoding": null, "synonymMaps": [] } ] } ], "scoringProfiles": [], "corsOptions": null, "suggesters": [], "analyzers": [], "normalizers": [], "tokenizers": [], "tokenFilters": [], 
"charFilters": [], "encryptionKey": null, "similarity": { "@odata.type": "#Microsoft.Azure.Search.BM25Similarity", "k1": null, "b": null }, "semantic": null, "vectorSearch": { "algorithms": [ { "name": "embeddings-config", "kind": "hnsw", "hnswParameters": { "metric": "cosine", "m": 4, "efConstruction": 400, "efSearch": 500 }, "exhaustiveKnnParameters": null } ], "profiles": [ { "name": "embeddings-config-profile", "algorithm": "embeddings-config", "vectorizer": null, "compression": null } ], "vectorizers": [], "compressions": [] } }
See also:
To create or change an Azure AI Search destination connector, see the following examples.
Replace the preceding placeholders as follows:
<name> (required) - A unique name for this connector.
<endpoint> (required) - The endpoint URL for Azure AI Search.
<index> (required) - The name of the index for Azure AI Search.
<azure-ai-search-key> (required) - The API key for Azure AI Search.
To change a connector, replace <connector-id>
with the destination connector’s unique ID.
To get this ID, see List destination connectors.