Skip to content

Commit

Permalink
Delay possibility to reuse transaction id when query is failing becau… (
Browse files Browse the repository at this point in the history
#13446)

…se of timeout

Motivation:

We should try to make it as unlikely as possible that we will reuse a
transaction for a query while the remote peer still might send the
response for the previously timed out query. Failing to do so put us in
risk that we will not be able to map the response correctly.

This reduces the risk of the following scenario:

- Query is send to the remote nameeserver
- Query is failed as the remote nameserver did not respond in the
configured time
- New query is send which uses the same transaction id as the failed
one. While we use a random to generate these this can still happen as we
only have 16bits for the id
- nameserver sends back the response for the first query (that we
already failed as timed out)
- We lookup the query for the id and find the new one and try to
complete it with the response for the "old query".
- This fails as validation of hostname etc not matches.

Modifications:

If we fail a query because of a timeout or cancellation, delay the removal of the id from
the context map. This will make the id not-reusable until the removal
actually take place.

Result:

Much less likely to reuse an id while the remote nameserver might still
respond to the old query
  • Loading branch information
normanmaurer authored Jun 16, 2023
1 parent bf8e779 commit b878314
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1372,6 1372,11 @@ public void channelRead(ChannelHandlerContext ctx, Object msg) {
qCh, queryId, res.sender());
res.release();
return;
} else if (qCtx.isDone()) {
logger.debug("{} Received a DNS response for a query that was timed out or cancelled: UDP [{}: {}]",
qCh, queryId, res.sender());
res.release();
return;
}

// Check if the response was truncated and if we can fallback to TCP to retry.
Expand Down Expand Up @@ -1419,7 1424,11 @@ public void channelRead(ChannelHandlerContext ctx, Object msg) {
}

DnsQueryContext foundCtx = queryContextManager.get(res.sender(), queryId);
if (foundCtx == tcpCtx) {
if (foundCtx != null && foundCtx.isDone()) {
logger.debug("{} Received a DNS response for a query that was timed out or cancelled "
": TCP [{}: {}]", tcpCh, queryId, res.sender());
response.release();
} else if (foundCtx == tcpCtx) {
tcpCtx.finishSuccess(new AddressedEnvelopeAdapter(
(InetSocketAddress) ctx.channel().remoteAddress(),
(InetSocketAddress) ctx.channel().localAddress(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 32,26 @@
import io.netty.util.concurrent.FutureListener;
import io.netty.util.concurrent.GenericFutureListener;
import io.netty.util.concurrent.Promise;
import io.netty.util.internal.SystemPropertyUtil;
import io.netty.util.internal.logging.InternalLogger;
import io.netty.util.internal.logging.InternalLoggerFactory;

import java.net.InetSocketAddress;
import java.util.concurrent.CancellationException;
import java.util.concurrent.TimeUnit;

import static io.netty.util.internal.ObjectUtil.checkNotNull;

abstract class DnsQueryContext {

private static final InternalLogger logger = InternalLoggerFactory.getInstance(DnsQueryContext.class);
private static final long ID_REUSE_ON_TIMEOUT_DELAY_MILLIS;

static {
ID_REUSE_ON_TIMEOUT_DELAY_MILLIS =
SystemPropertyUtil.getLong("io.netty.resolver.dns.idReuseOnTimeoutDelayMillis", 10000);
logger.debug("-Dio.netty.resolver.dns.idReuseOnTimeoutDelayMillis: {}", ID_REUSE_ON_TIMEOUT_DELAY_MILLIS);
}

private final Future<? extends Channel> channelReadyFuture;
private final Channel channel;
Expand Down Expand Up @@ -101,6 110,15 @@ private static boolean hasOptRecord(DnsRecord[] additionals) {
return false;
}

/**
* Returns {@code true} if the query was completed already.
*
* @return {@code true} if done.
*/
final boolean isDone() {
return promise.isDone();
}

/**
* Returns the {@link DnsQuestion} that will be written as part of the {@link DnsQuery}.
*
Expand Down Expand Up @@ -149,11 167,22 @@ public void operationComplete(Future<AddressedEnvelope<DnsResponse, InetSocketAd
timeoutFuture.cancel(false);
}

// Remove the id from the manager as soon as the query completes. This may be because of success,
// failure or cancellation
DnsQueryContext self = queryContextManager.remove(nameServerAddr, id);

assert self == DnsQueryContext.this : "Removed DnsQueryContext is not the correct instance";
Throwable cause = future.cause();
if (cause instanceof DnsNameResolverTimeoutException || cause instanceof CancellationException) {
// This query was failed due a timeout or cancellation. Let's delay the removal of the id to reduce
// the risk of reusing the same id again while the remote nameserver might send the response after
// the timeout.
channel.eventLoop().schedule(new Runnable() {
@Override
public void run() {
removeFromContextManager(nameServerAddr);
}
}, ID_REUSE_ON_TIMEOUT_DELAY_MILLIS, TimeUnit.MILLISECONDS);
} else {
// Remove the id from the manager as soon as the query completes. This may be because of success,
// failure or cancellation
removeFromContextManager(nameServerAddr);
}
}
});
final DnsQuestion question = question();
Expand All @@ -179,6 208,12 @@ public void operationComplete(Future<AddressedEnvelope<DnsResponse, InetSocketAd
return sendQuery(nameServerAddr, query, queryTimeoutMillis, flush);
}

private void removeFromContextManager(InetSocketAddress nameServerAddr) {
DnsQueryContext self = queryContextManager.remove(nameServerAddr, id);

assert self == this : "Removed DnsQueryContext is not the correct instance";
}

private ChannelFuture sendQuery(final InetSocketAddress nameServerAddr, final DnsQuery query,
final long queryTimeoutMillis, final boolean flush) {
final ChannelPromise writePromise = channel.newPromise();
Expand Down

0 comments on commit b878314

Please sign in to comment.