specification/_global/bulk/BulkRequest.ts (43 lines of code) (raw):
/*
* Licensed to Elasticsearch B.V. under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch B.V. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import { SourceConfigParam } from '@global/search/_types/SourceFilter'
import { RequestBase } from '@_types/Base'
import {
Fields,
IndexName,
Refresh,
Routing,
WaitForActiveShards
} from '@_types/common'
import { Duration } from '@_types/Time'
import { OperationContainer, UpdateAction } from './types'
/**
* Bulk index or delete documents.
* Perform multiple `index`, `create`, `delete`, and `update` actions in a single request.
* This reduces overhead and can greatly increase indexing speed.
*
* If the Elasticsearch security features are enabled, you must have the following index privileges for the target data stream, index, or index alias:
*
* * To use the `create` action, you must have the `create_doc`, `create`, `index`, or `write` index privilege. Data streams support only the `create` action.
* * To use the `index` action, you must have the `create`, `index`, or `write` index privilege.
* * To use the `delete` action, you must have the `delete` or `write` index privilege.
* * To use the `update` action, you must have the `index` or `write` index privilege.
* * To automatically create a data stream or index with a bulk API request, you must have the `auto_configure`, `create_index`, or `manage` index privilege.
* * To make the result of a bulk operation visible to search using the `refresh` parameter, you must have the `maintenance` or `manage` index privilege.
*
* Automatic data stream creation requires a matching index template with data stream enabled.
*
* The actions are specified in the request body using a newline delimited JSON (NDJSON) structure:
*
* ```
* action_and_meta_data\n
* optional_source\n
* action_and_meta_data\n
* optional_source\n
* ....
* action_and_meta_data\n
* optional_source\n
* ```
*
* The `index` and `create` actions expect a source on the next line and have the same semantics as the `op_type` parameter in the standard index API.
* A `create` action fails if a document with the same ID already exists in the target.
* An `index` action adds or replaces a document as necessary.
*
* NOTE: Data streams support only the `create` action.
* To update or delete a document in a data stream, you must target the backing index containing the document.
*
* An `update` action expects that the partial doc, upsert, and script and its options are specified on the next line.
*
* A `delete` action does not expect a source on the next line and has the same semantics as the standard delete API.
*
* NOTE: The final line of data must end with a newline character (`\n`).
* Each newline character may be preceded by a carriage return (`\r`).
* When sending NDJSON data to the `_bulk` endpoint, use a `Content-Type` header of `application/json` or `application/x-ndjson`.
* Because this format uses literal newline characters (`\n`) as delimiters, make sure that the JSON actions and sources are not pretty printed.
*
* If you provide a target in the request path, it is used for any actions that don't explicitly specify an `_index` argument.
*
* A note on the format: the idea here is to make processing as fast as possible.
* As some of the actions are redirected to other shards on other nodes, only `action_meta_data` is parsed on the receiving node side.
*
* Client libraries using this protocol should strive to do something similar on the client side, and reduce buffering as much as possible.
*
* There is no "correct" number of actions to perform in a single bulk request.
* Experiment with different settings to find the optimal size for your particular workload.
* Note that Elasticsearch limits the maximum size of an HTTP request to 100mb by default so clients must ensure that no request exceeds this size.
* It is not possible to index a single document that exceeds the size limit, so you must pre-process any such documents into smaller pieces before sending them to Elasticsearch.
* For instance, split documents into pages or chapters before indexing them, or store raw binary data in a system outside Elasticsearch and replace the raw data with a link to the external system in the documents that you send to Elasticsearch.
*
* **Client support for bulk requests**
*
* Some of the officially supported clients provide helpers to assist with bulk requests and reindexing:
*
* * Go: Check out `esutil.BulkIndexer`
* * Perl: Check out `Search::Elasticsearch::Client::5_0::Bulk` and `Search::Elasticsearch::Client::5_0::Scroll`
* * Python: Check out `elasticsearch.helpers.*`
* * JavaScript: Check out `client.helpers.*`
* * .NET: Check out `BulkAllObservable`
* * PHP: Check out bulk indexing.
*
* **Submitting bulk requests with cURL**
*
* If you're providing text file input to `curl`, you must use the `--data-binary` flag instead of plain `-d`.
* The latter doesn't preserve newlines. For example:
*
* ```
* $ cat requests
* { "index" : { "_index" : "test", "_id" : "1" } }
* { "field1" : "value1" }
* $ curl -s -H "Content-Type: application/x-ndjson" -XPOST localhost:9200/_bulk --data-binary "@requests"; echo
* {"took":7, "errors": false, "items":[{"index":{"_index":"test","_id":"1","_version":1,"result":"created","forced_refresh":false}}]}
* ```
*
* **Optimistic concurrency control**
*
* Each `index` and `delete` action within a bulk API call may include the `if_seq_no` and `if_primary_term` parameters in their respective action and meta data lines.
* The `if_seq_no` and `if_primary_term` parameters control how operations are run, based on the last modification to existing documents. See Optimistic concurrency control for more details.
*
* **Versioning**
*
* Each bulk item can include the version value using the `version` field.
* It automatically follows the behavior of the index or delete operation based on the `_version` mapping.
* It also supports `version_type`.
*
* **Routing**
*
* Each bulk item can include the routing value using the `routing` field.
* It automatically follows the behavior of the index or delete operation based on the `_routing` mapping.
*
* NOTE: Data streams do not support custom routing unless they were created with the `allow_custom_routing` setting enabled in the template.
*
* **Wait for active shards**
*
* When making bulk calls, you can set the `wait_for_active_shards` parameter to require a minimum number of shard copies to be active before starting to process the bulk request.
*
* **Refresh**
*
* Control when the changes made by this request are visible to search.
*
* NOTE: Only the shards that receive the bulk request will be affected by refresh.
* Imagine a `_bulk?refresh=wait_for` request with three documents in it that happen to be routed to different shards in an index with five shards.
* The request will only wait for those three shards to refresh.
* The other two shards that make up the index do not participate in the `_bulk` request at all.
* @rest_spec_name bulk
* @availability stack stability=stable
* @availability serverless stability=stable visibility=public
* @doc_id docs-bulk
* @doc_tag document
*
*/
export interface Request<TDocument, TPartialDocument> extends RequestBase {
// URL templates for this request: the bare `/_bulk` path and the
// `/{index}/_bulk` path that carries a target. Both accept `POST` and `PUT`.
urls: [
{
path: '/_bulk'
methods: ['POST', 'PUT']
},
{
path: '/{index}/_bulk'
methods: ['POST', 'PUT']
}
]
// Value for the `{index}` placeholder in the URL templates above.
// It is optional because the `/_bulk` template has no placeholder.
path_parts: {
/**
* The name of the data stream, index, or index alias to perform bulk actions on.
*/
index?: IndexName
}
// Every member below is declared optional (`?`).
query_parameters: {
/**
* If `true`, the document source is included in the error message in case of parsing errors.
* If `false`, the document source is omitted from the error message.
* @server_default true
*/
include_source_on_error?: boolean
/**
* If `true`, the response will include the ingest pipelines that were run for each `index` or `create` action.
* @server_default false
*/
list_executed_pipelines?: boolean
/**
* The pipeline identifier to use to preprocess incoming documents.
* If the index has a default ingest pipeline specified, setting the value to `_none` turns off the default ingest pipeline for this request.
* If a final pipeline is configured, it will always run regardless of the value of this parameter.
*/
pipeline?: string
/**
* If `true`, Elasticsearch refreshes the affected shards to make this operation visible to search.
* If `wait_for`, wait for a refresh to make this operation visible to search.
* If `false`, do nothing with refreshes.
* Valid values: `true`, `false`, `wait_for`.
* @server_default false
*/
refresh?: Refresh
/**
* A custom value that is used to route operations to a specific shard.
*/
routing?: Routing
/**
* Indicates whether to return the `_source` field (`true` or `false`) or contains a list of fields to return.
*/
_source?: SourceConfigParam
/**
* A comma-separated list of source fields to exclude from the response.
* You can also use this parameter to exclude fields from the subset specified in `_source_includes` query parameter.
* If the `_source` parameter is `false`, this parameter is ignored.
*/
_source_excludes?: Fields
/**
* A comma-separated list of source fields to include in the response.
* If this parameter is specified, only these source fields are returned.
* You can exclude fields from this subset using the `_source_excludes` query parameter.
* If the `_source` parameter is `false`, this parameter is ignored.
*/
_source_includes?: Fields
/**
* The period each action waits for the following operations: automatic index creation, dynamic mapping updates, and waiting for active shards.
* The default is `1m` (one minute), which guarantees Elasticsearch waits for at least the timeout before failing.
* The actual wait time could be longer, particularly when multiple waits occur.
* @server_default 1m
*/
timeout?: Duration
/**
* The number of shard copies that must be active before proceeding with the operation.
* Set to `all` or any positive integer up to the total number of shards in the index (`number_of_replicas+1`).
* The default is `1`, which waits for each primary shard to be active.
* @server_default 1
*/
wait_for_active_shards?: WaitForActiveShards
/**
* If `true`, the request's actions must target an index alias.
* @server_default false
*/
require_alias?: boolean
/**
* If `true`, the request's actions must target a data stream (existing or to be created).
* @server_default false
*/
require_data_stream?: boolean
}
/**
* The request body contains a newline-delimited list of `create`, `delete`, `index`, and `update` actions and their associated source data.
* @codegen_name operations
*/
// This declaration captures action_and_meta_data (OperationContainer) and the two kinds of sources
// that can follow: an update action for update operations and anything for index or create operations.
// `TDocument` is the source type for index/create entries; both `TDocument` and
// `TPartialDocument` are passed through to `UpdateAction` for update entries.
// /!\ must be kept in sync with BulkMonitoringRequest
body: Array<
OperationContainer | UpdateAction<TDocument, TPartialDocument> | TDocument
>
}