 
 import logging
 from operator import methodcaller
+import time
 
 from ..exceptions import ElasticsearchException, TransportError
 from ..compat import map, string_types, Queue
@@ -52,9 +53,10 @@ def _chunk_actions(actions, chunk_size, max_chunk_bytes, serializer):
     Split actions into chunks by number or size, serialize them into strings in
     the process.
     """
-    bulk_actions = []
+    bulk_actions, bulk_data = [], []
     size, action_count = 0, 0
     for action, data in actions:
+        raw_data, raw_action = data, action
         action = serializer.dumps(action)
         cur_size = len(action) + 1
 
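
For context (not part of this diff): the ``(action, data)`` pairs consumed above come from ``expand_action_callback``; a rough illustration of their shape, with purely made-up documents:

# Illustrative only -- the shape of the (action, data) pairs _chunk_actions iterates over.
# raw_action/raw_data keep the un-serialized versions so failures can later be
# reported (or retried) without parsing the request body back.
actions = [
    # index operation: action line plus a source document
    ({"index": {"_index": "test", "_type": "doc", "_id": 1}}, {"field": "value"}),
    # delete operation: action line only, data is None
    ({"delete": {"_index": "test", "_type": "doc", "_id": 2}}, None),
]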
@@ -64,20 +66,24 @@ def _chunk_actions(actions, chunk_size, max_chunk_bytes, serializer):
 
         # full chunk, send it and start a new one
         if bulk_actions and (size + cur_size > max_chunk_bytes or action_count == chunk_size):
-            yield bulk_actions
-            bulk_actions = []
+            yield bulk_data, bulk_actions
+            bulk_actions, bulk_data = [], []
             size, action_count = 0, 0
 
         bulk_actions.append(action)
         if data is not None:
             bulk_actions.append(data)
+            bulk_data.append((raw_action, raw_data))
+        else:
+            bulk_data.append((raw_action,))
+
         size += cur_size
         action_count += 1
 
     if bulk_actions:
-        yield bulk_actions
+        yield bulk_data, bulk_actions
 
-def _process_bulk_chunk(client, bulk_actions, raise_on_exception=True, raise_on_error=True, **kwargs):
+def _process_bulk_chunk(client, bulk_actions, bulk_data, raise_on_exception=True, raise_on_error=True, **kwargs):
     """
     Send a bulk request to elasticsearch and process the output.
     """
@@ -96,22 +102,14 @@ def _process_bulk_chunk(client, bulk_actions, raise_on_exception=True, raise_on_
         err_message = str(e)
         exc_errors = []
 
-        # deserialize the data back, this is expensive but only run on
-        # errors if raise_on_exception is false, so shouldn't be a real
-        # issue
-        bulk_data = map(client.transport.serializer.loads, bulk_actions)
-        while True:
-            try:
-                # collect all the information about failed actions
-                action = next(bulk_data)
-                op_type, action = action.popitem()
-                info = {"error": err_message, "status": e.status_code, "exception": e}
-                if op_type != 'delete':
-                    info['data'] = next(bulk_data)
-                info.update(action)
-                exc_errors.append({op_type: info})
-            except StopIteration:
-                break
+        for data in bulk_data:
+            # collect all the information about failed actions
+            op_type, action = data[0].copy().popitem()
+            info = {"error": err_message, "status": e.status_code, "exception": e}
+            if op_type != 'delete':
+                info['data'] = data[1]
+            info.update(action)
+            exc_errors.append({op_type: info})
 
         # emulate standard behavior for failed actions
         if raise_on_error:
@@ -122,9 +120,12 @@ def _process_bulk_chunk(client, bulk_actions, raise_on_exception=True, raise_on_
             return
 
     # go through request-response pairs and detect failures
-    for op_type, item in map(methodcaller('popitem'), resp['items']):
+    for data, (op_type, item) in zip(bulk_data, map(methodcaller('popitem'), resp['items'])):
         ok = 200 <= item.get('status', 500) < 300
         if not ok and raise_on_error:
+            # include original document source
+            if len(data) > 1:
+                item['data'] = data[1]
             errors.append({op_type: item})
 
         if ok or not errors:
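
Not part of the diff: with the original tuples carried in ``bulk_data``, a failed item reported by ``_process_bulk_chunk`` can now include the document source under a ``data`` key. An invented example of such an entry:

# Illustrative only -- a hypothetical failed-action entry after this change.
# The values are made up; the point is the added 'data' key carrying the
# original document source for non-delete operations.
failed_item = {
    "index": {
        "_index": "test",
        "_type": "doc",
        "_id": "1",
        "status": 429,
        "error": "rejected_execution_exception: ...",
        "data": {"field": "value"},   # original source taken from bulk_data
    }
}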
@@ -136,15 +137,22 @@ def _process_bulk_chunk(client, bulk_actions, raise_on_exception=True, raise_on_
         raise BulkIndexError('%i document(s) failed to index.' % len(errors), errors)
 
 def streaming_bulk(client, actions, chunk_size=500, max_chunk_bytes=100 * 1024 * 1024,
-                   raise_on_error=True, expand_action_callback=expand_action,
-                   raise_on_exception=True, **kwargs):
+                   raise_on_error=True, expand_action_callback=expand_action,
+                   raise_on_exception=True, max_retries=0, initial_backoff=2,
+                   max_backoff=600, yield_ok=True, **kwargs):
+
     """
     Streaming bulk consumes actions from the iterable passed in and yields
     results per action. For non-streaming use cases use
     :func:`~elasticsearch.helpers.bulk` which is a wrapper around streaming
     bulk that returns summary information about the bulk operation once the
     entire input is consumed and sent.
 
+    If you specify ``max_retries`` it will also retry any documents that were
+    rejected with a ``429`` status code. To do this it will wait (**by calling
+    time.sleep which will block**) for ``initial_backoff`` seconds and then,
+    on every subsequent rejection of the same chunk, for double the previous
+    wait, up to ``max_backoff`` seconds.
 
     :arg client: instance of :class:`~elasticsearch.Elasticsearch` to use
     :arg actions: iterable containing the actions to be executed
@@ -157,12 +165,59 @@ def streaming_bulk(client, actions, chunk_size=500, max_chunk_bytes=100 * 1024 *
     :arg expand_action_callback: callback executed on each action passed in,
         should return a tuple containing the action line and the data line
        (`None` if data line should be omitted).
+    :arg max_retries: maximum number of times a document will be retried when
+        ``429`` is received, set to 0 (default) for no retries on ``429``
+    :arg initial_backoff: number of seconds we should wait before the first
+        retry. Any subsequent retries will be powers of ``initial_backoff *
+        2**retry_number``
+    :arg max_backoff: maximum number of seconds a retry will wait
+    :arg yield_ok: if set to False will skip successful documents in the output
     """
     actions = map(expand_action_callback, actions)
 
-    for bulk_actions in _chunk_actions(actions, chunk_size, max_chunk_bytes, client.transport.serializer):
-        for result in _process_bulk_chunk(client, bulk_actions, raise_on_exception, raise_on_error, **kwargs):
-            yield result
+    for bulk_data, bulk_actions in _chunk_actions(actions, chunk_size,
+                                                  max_chunk_bytes,
+                                                  client.transport.serializer):
+
+        for attempt in range(max_retries + 1):
+            to_retry, to_retry_data = [], []
+            if attempt:
+                time.sleep(min(max_backoff, initial_backoff * 2 ** (attempt - 1)))
+
+            try:
+                for data, (ok, info) in zip(
+                    bulk_data,
+                    _process_bulk_chunk(client, bulk_actions, bulk_data,
+                                        raise_on_exception,
+                                        raise_on_error, **kwargs)
+                ):
+
+                    if not ok:
+                        action, info = info.popitem()
+                        # retry if retries enabled, we get 429, and we are not
+                        # in the last attempt
+                        if max_retries \
+                                and info['status'] == 429 \
+                                and (attempt + 1) <= max_retries:
+                            # _process_bulk_chunk expects strings so we need to
+                            # re-serialize the data
+                            to_retry.extend(map(client.transport.serializer.dumps, data))
+                            to_retry_data.append(data)
+                        else:
+                            yield ok, {action: info}
+                    elif yield_ok:
+                        yield ok, info
+
+            except TransportError as e:
+                # suppress 429 errors since we will retry them
+                if not max_retries or e.status_code != 429:
+                    raise
+            else:
+                if not to_retry:
+                    break
+                # retry only subset of documents that didn't succeed
+                bulk_actions, bulk_data = to_retry, to_retry_data
+
 
 def bulk(client, actions, stats_only=False, **kwargs):
     """
@@ -175,9 +230,12 @@ def bulk(client, actions, stats_only=False, **kwargs):
     options like ``stats_only`` only apply when ``raise_on_error`` is set to
     ``False``.
 
+    When errors are being collected, original document data is included in the
+    error dictionary, which can lead to extra high memory usage. If you need
+    to process a lot of data and want to ignore/collect errors please consider
+    using the :func:`~elasticsearch.helpers.streaming_bulk` helper which will
+    just return the errors and not store them in memory.
 
-    See :func:`~elasticsearch.helpers.streaming_bulk` for more accepted
-    parameters
 
     :arg client: instance of :class:`~elasticsearch.Elasticsearch` to use
     :arg actions: iterator containing the actions
@@ -186,13 +244,16 @@ def bulk(client, actions, stats_only=False, **kwargs):
 
     Any additional keyword arguments will be passed to
     :func:`~elasticsearch.helpers.streaming_bulk` which is used to execute
-    the operation.
+    the operation, see :func:`~elasticsearch.helpers.streaming_bulk` for more
+    accepted parameters.
     """
     success, failed = 0, 0
 
     # list of errors to be collected if not stats_only
     errors = []
 
+    # make streaming_bulk yield successful results so we can count them
+    kwargs['yield_ok'] = True
     for ok, item in streaming_bulk(client, actions, **kwargs):
         # go through request-response pairs and detect failures
         if not ok:
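
Not part of the diff: a usage sketch of the ``bulk()`` wrapper, reusing the assumed ``es`` client and ``generate_actions()`` from the previous sketch; ``stats_only`` trades stored error detail for plain counts:

from elasticsearch.helpers import bulk

# collect full error dictionaries (uses more memory if many actions fail)
success, errors = bulk(es, generate_actions(), raise_on_error=False)
print("%d ok, %d failed" % (success, len(errors)))

# or only count failures, without keeping the error dicts around
success, failed = bulk(es, generate_actions(), stats_only=True, raise_on_error=False)
print("%d ok, %d failed" % (success, failed))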
@@ -241,7 +302,7 @@ def _setup_queues(self):
 
     try:
         for result in pool.imap(
-            lambda chunk: list(_process_bulk_chunk(client, chunk, **kwargs)),
+            lambda bulk_chunk: list(_process_bulk_chunk(client, bulk_chunk[1], bulk_chunk[0], **kwargs)),
            _chunk_actions(actions, chunk_size, max_chunk_bytes, client.transport.serializer)
        ):
            for item in result:
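
Not part of the diff: the adjusted lambda simply unpacks the ``(bulk_data, bulk_actions)`` tuples that ``_chunk_actions`` now yields, so ``parallel_bulk`` is called exactly as before. A sketch, again reusing the assumed ``es`` and ``generate_actions()``:

from elasticsearch.helpers import parallel_bulk

# same result stream as streaming_bulk, but chunks are sent from a thread pool
for ok, item in parallel_bulk(es, generate_actions(), thread_count=4, chunk_size=500):
    if not ok:
        print("failed:", item)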