@@ -100,19 +100,37 @@ public TimedAttemptSettings createNextAttempt(TimedAttemptSettings prevSettings)
100100 (long ) (settings .getRetryDelayMultiplier () * prevSettings .getRetryDelay ().toMillis ());
101101 newRetryDelay = Math .min (newRetryDelay , settings .getMaxRetryDelay ().toMillis ());
102102 }
103+ Duration randomDelay = Duration .ofMillis (nextRandomLong (newRetryDelay ));
103104
104105 // The rpc timeout is determined as follows:
105106 // attempt #0 - use the initialRpcTimeout;
106- // attempt #1+ - use the calculated value.
107+ // attempt #1+ - use the calculated value, or the time remaining in totalTimeout if the
108+ // calculated value would exceed the totalTimeout.
107109 long newRpcTimeout =
108110 (long ) (settings .getRpcTimeoutMultiplier () * prevSettings .getRpcTimeout ().toMillis ());
109111 newRpcTimeout = Math .min (newRpcTimeout , settings .getMaxRpcTimeout ().toMillis ());
110112
113+ // The totalTimeout could be zero if a callable is only using maxAttempts to limit retries.
114+ // If set, calculate time remaining in the totalTimeout since the start, taking into account the
115+ // next attempt's delay, in order to truncate the RPC timeout should it exceed the totalTimeout.
116+ if (!settings .getTotalTimeout ().isZero ()) {
117+ Duration timeElapsed =
118+ Duration .ofNanos (clock .nanoTime ())
119+ .minus (Duration .ofNanos (prevSettings .getFirstAttemptStartTimeNanos ()));
120+ Duration timeLeft = globalSettings .getTotalTimeout ().minus (timeElapsed ).minus (randomDelay );
121+
122+ // If timeLeft at this point is < 0, the shouldRetry logic will prevent
123+ // the attempt from being made as it would exceed the totalTimeout. A negative RPC timeout
124+ // will result in a deadline in the past, which will always fail prior to making a
125+ // network call.
126+ newRpcTimeout = Math .min (newRpcTimeout , timeLeft .toMillis ());
127+ }
128+
111129 return TimedAttemptSettings .newBuilder ()
112130 .setGlobalSettings (prevSettings .getGlobalSettings ())
113131 .setRetryDelay (Duration .ofMillis (newRetryDelay ))
114132 .setRpcTimeout (Duration .ofMillis (newRpcTimeout ))
115- .setRandomizedRetryDelay (Duration . ofMillis ( nextRandomLong ( newRetryDelay )) )
133+ .setRandomizedRetryDelay (randomDelay )
116134 .setAttemptCount (prevSettings .getAttemptCount () + 1 )
117135 .setOverallAttemptCount (prevSettings .getOverallAttemptCount () + 1 )
118136 .setFirstAttemptStartTimeNanos (prevSettings .getFirstAttemptStartTimeNanos ())
@@ -144,7 +162,16 @@ public boolean shouldRetry(TimedAttemptSettings nextAttemptSettings) {
144162 - nextAttemptSettings .getFirstAttemptStartTimeNanos ()
145163 + nextAttemptSettings .getRandomizedRetryDelay ().toNanos ();
146164
147- // If totalTimeout limit is defined, check that it hasn't been crossed
165+ // If totalTimeout limit is defined, check that it hasn't been crossed.
166+ //
167+ // Note: if the potential time spent is exactly equal to the totalTimeout,
168+ // the attempt will still be allowed. This might not be desired, but if we
169+ // enforce it, it could have potentially negative side effects on LRO polling.
170+ // Specifically, if a polling retry attempt is denied, the LRO is canceled, and
171+ // if a polling retry attempt is denied because its delay would *reach* the
172+ // totalTimeout, the LRO would be canceled prematurely. The problem here is that
173+ // totalTimeout doubles as the polling threshold and also the time limit for an
174+ // operation to finish.
148175 if (totalTimeout > 0 && totalTimeSpentNanos > totalTimeout ) {
149176 return false ;
150177 }
0 commit comments