// Copyright (C) MongoDB, Inc. 2017-present.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

package topology

import (
	"context"
	"errors"
	"fmt"
	"net"
	"sync"
	"sync/atomic"
	"time"

	"go.mongodb.org/mongo-driver/bson/primitive"
	"go.mongodb.org/mongo-driver/event"
	"go.mongodb.org/mongo-driver/mongo/address"
	"go.mongodb.org/mongo-driver/mongo/description"
	"go.mongodb.org/mongo-driver/x/mongo/driver"
	"go.mongodb.org/mongo-driver/x/mongo/driver/operation"
)

const minHeartbeatInterval = 500 * time.Millisecond

// Server state constants.
const (
	serverDisconnected int64 = iota
	serverDisconnecting
	serverConnected
)

func serverStateString(state int64) string {
	switch state {
	case serverDisconnected:
		return "Disconnected"
	case serverDisconnecting:
		return "Disconnecting"
	case serverConnected:
		return "Connected"
	}

	return ""
}

var (
	// ErrServerClosed occurs when an attempt to Get a connection is made after
	// the server has been closed.
	ErrServerClosed = errors.New("server is closed")

	// ErrServerConnected occurs when an attempt to Connect is made after a server
	// has already been connected.
	ErrServerConnected = errors.New("server is connected")

	errCheckCancelled = errors.New("server check cancelled")
	emptyDescription  = description.NewDefaultServer("")
)

// SelectedServer represents a specific server that was selected during server selection.
// It contains the kind of the topology it was selected from.
type SelectedServer struct {
	*Server

	Kind description.TopologyKind
}

// Description returns a description of the server as of the last heartbeat.
func (ss *SelectedServer) Description() description.SelectedServer {
	sdesc := ss.Server.Description()
	return description.SelectedServer{
		Server: sdesc,
		Kind:   ss.Kind,
	}
}

// Server is a single server within a topology.
type Server struct {
	// The following integer fields must be accessed using the atomic package and should be at the
	// beginning of the struct.
	// - atomic bug: https://pkg.go.dev/sync/atomic#pkg-note-BUG
	// - suggested layout: https://go101.org/article/memory-layout.html
	state          int64
	operationCount int64

	cfg     *serverConfig
	address address.Address

	// connection related fields
	pool *pool

	// goroutine management fields
	done          chan struct{}
	checkNow      chan struct{}
	disconnecting chan struct{}
	closewg       sync.WaitGroup

	// description related fields
	desc                   atomic.Value // holds a description.Server
	updateTopologyCallback atomic.Value
	topologyID             primitive.ObjectID

	// subscriber related fields
	subLock             sync.Mutex
	subscribers         map[uint64]chan description.Server
	currentSubscriberID uint64
	subscriptionsClosed bool

	// heartbeat and cancellation related fields
	// globalCtx should be created in NewServer and cancelled in Disconnect to signal that the server is shutting down.
	// heartbeatCtx should be used for individual heartbeats and should be a child of globalCtx so that it will be
	// cancelled automatically during shutdown.
	heartbeatLock      sync.Mutex
	conn               *connection
	globalCtx          context.Context
	globalCtxCancel    context.CancelFunc
	heartbeatCtx       context.Context
	heartbeatCtxCancel context.CancelFunc

	processErrorLock sync.Mutex
	rttMonitor       *rttMonitor
}

// updateTopologyCallback is a callback, provided when creating a server, that should be called when the parent
// Topology instance should be updated based on a new server description.
// The callback must return the server description that should be stored by the server.
type updateTopologyCallback func(description.Server) description.Server

// ConnectServer creates a new Server and then initializes it using the
// Connect method.
func ConnectServer(
	addr address.Address,
	updateCallback updateTopologyCallback,
	topologyID primitive.ObjectID,
	opts ...ServerOption,
) (*Server, error) {
	srvr := NewServer(addr, topologyID, opts...)
	err := srvr.Connect(updateCallback)
	if err != nil {
		return nil, err
	}
	return srvr, nil
}

// NewServer creates a new server. The mongodb server at the address will be monitored
// on an internal monitoring goroutine.
func NewServer(addr address.Address, topologyID primitive.ObjectID, opts ...ServerOption) *Server {
	cfg := newServerConfig(opts...)
	globalCtx, globalCtxCancel := context.WithCancel(context.Background())
	s := &Server{
		state: serverDisconnected,

		cfg:     cfg,
		address: addr,

		done:          make(chan struct{}),
		checkNow:      make(chan struct{}, 1),
		disconnecting: make(chan struct{}),

		topologyID: topologyID,

		subscribers:     make(map[uint64]chan description.Server),
		globalCtx:       globalCtx,
		globalCtxCancel: globalCtxCancel,
	}
	s.desc.Store(description.NewDefaultServer(addr))
	rttCfg := &rttConfig{
		interval:           cfg.heartbeatInterval,
		minRTTWindow:       5 * time.Minute,
		createConnectionFn: s.createConnection,
		createOperationFn:  s.createBaseOperation,
	}
	s.rttMonitor = newRTTMonitor(rttCfg)

	pc := poolConfig{
		Address:          addr,
		MinPoolSize:      cfg.minConns,
		MaxPoolSize:      cfg.maxConns,
		MaxConnecting:    cfg.maxConnecting,
		MaxIdleTime:      cfg.poolMaxIdleTime,
		MaintainInterval: cfg.poolMaintainInterval,
		PoolMonitor:      cfg.poolMonitor,
		handshakeErrFn:   s.ProcessHandshakeError,
	}

	connectionOpts := copyConnectionOpts(cfg.connectionOpts)
	s.pool = newPool(pc, connectionOpts...)

	s.publishServerOpeningEvent(s.address)

	return s
}

// Connect initializes the Server by starting background monitoring goroutines.
// This method must be called before a Server can be used.
func (s *Server) Connect(updateCallback updateTopologyCallback) error {
	if !atomic.CompareAndSwapInt64(&s.state, serverDisconnected, serverConnected) {
		return ErrServerConnected
	}

	desc := description.NewDefaultServer(s.address)
	if s.cfg.loadBalanced {
		// LBs automatically start off with kind LoadBalancer because there is no monitoring routine for state changes.
		desc.Kind = description.LoadBalancer
	}
	s.desc.Store(desc)
	s.updateTopologyCallback.Store(updateCallback)

	if !s.cfg.monitoringDisabled && !s.cfg.loadBalanced {
		s.rttMonitor.connect()
		s.closewg.Add(1)
		go s.update()
	}

	// The CMAP spec describes that pools should only be marked "ready" when the server description
	// is updated to something other than "Unknown". However, we maintain the previous Server
	// behavior here and immediately mark the pool as ready during Connect() to simplify and speed
	// up the Client startup behavior. The risk of marking a pool as ready proactively during
	// Connect() is that we could attempt to create connections to a server that was configured
	// erroneously until the first server check or checkOut() failure occurs, when the SDAM error
	// handler would transition the Server back to "Unknown" and set the pool to "paused".
	return s.pool.ready()
}
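
// The following is an illustrative usage sketch, not code from this package: the wiring of
// updateCallback, topologyID, and ctx is normally handled by the parent Topology and is assumed
// here.
//
//	srv, err := ConnectServer(address.Address("localhost:27017"), updateCallback, topologyID)
//	if err != nil {
//		// handle error
//	}
//	conn, err := srv.Connection(ctx) // check out a pooled connection for an operation
//	// ... use conn, then return it to the pool ...
//	_ = srv.Disconnect(ctx) // stop monitoring and close all connections
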
// Disconnect closes sockets to the server referenced by this Server.
// Subscriptions to this Server will be closed. Disconnect will shut down
// any monitoring goroutines, close the idle connection pool, and will
// wait until all the in-use connections have been returned to the connection
// pool and are closed before returning. If the context expires via
// cancellation, deadline, or timeout before the in-use connections have been
// returned, the in-use connections will be closed, resulting in the failure of
// any in-flight read or write operations. If this method returns with no
// errors, all connections associated with this Server have been closed.
func (s *Server) Disconnect(ctx context.Context) error {
	if !atomic.CompareAndSwapInt64(&s.state, serverConnected, serverDisconnecting) {
		return ErrServerClosed
	}

	s.updateTopologyCallback.Store((updateTopologyCallback)(nil))

	// Cancel the global context so any new contexts created from it will be automatically cancelled. Close the done
	// channel so the update() routine will know that it can stop. Cancel any in-progress monitoring checks at the end.
	// The done channel is closed before cancelling the check so the update() routine will immediately detect that it
	// can stop rather than trying to create new connections until the read from done succeeds.
	s.globalCtxCancel()
	close(s.done)
	s.cancelCheck()

	s.rttMonitor.disconnect()
	s.pool.close(ctx)

	s.closewg.Wait()
	atomic.StoreInt64(&s.state, serverDisconnected)

	return nil
}

// Connection gets a connection to the server.
func (s *Server) Connection(ctx context.Context) (driver.Connection, error) {
	if atomic.LoadInt64(&s.state) != serverConnected {
		return nil, ErrServerClosed
	}

	// Increment the operation count before calling checkOut to make sure that all connection
	// requests are included in the operation count, including those in the wait queue. If we got an
	// error instead of a connection, immediately decrement the operation count.
	atomic.AddInt64(&s.operationCount, 1)
	conn, err := s.pool.checkOut(ctx)
	if err != nil {
		atomic.AddInt64(&s.operationCount, -1)
		return nil, err
	}

	return &Connection{
		connection: conn,
		cleanupServerFn: func() {
			// Decrement the operation count whenever the caller is done with the connection. Note
			// that cleanupServerFn() is not called while the connection is pinned to a cursor or
			// transaction, so the operation count is not decremented until the cursor is closed or
			// the transaction is committed or aborted. Use an int64 instead of a uint64 to mitigate
			// the impact of any possible bugs that could cause the uint64 to underflow, which would
			// make the server much less selectable.
			atomic.AddInt64(&s.operationCount, -1)
		},
	}, nil
}

// ProcessHandshakeError implements SDAM error handling for errors that occur before a connection
// finishes handshaking.
func (s *Server) ProcessHandshakeError(err error, startingGenerationNumber uint64, serviceID *primitive.ObjectID) {
	// Ignore the error if the server is behind a load balancer but the service ID is unknown. This indicates that the
	// error happened when dialing the connection or during the MongoDB handshake, so we don't know the service ID to
	// use for clearing the pool.
	if err == nil || s.cfg.loadBalanced && serviceID == nil {
		return
	}

	// Ignore the error if the connection is stale.
	if startingGenerationNumber < s.pool.generation.getGeneration(serviceID) {
		return
	}

	wrappedConnErr := unwrapConnectionError(err)
	if wrappedConnErr == nil {
		return
	}

	// Must hold the processErrorLock while updating the server description and clearing the pool.
	// Not holding the lock leads to possible out-of-order processing of pool.clear() and
	// pool.ready() calls from concurrent server description updates.
	s.processErrorLock.Lock()
	defer s.processErrorLock.Unlock()

	// Since the only kind of ConnectionError we receive from pool.Get will be an initialization error, we should set
	// the description.Server appropriately. The description should not have a TopologyVersion because the staleness
	// checking logic above has already determined that this description is not stale.
	s.updateDescription(description.NewServerFromError(s.address, wrappedConnErr, nil))
	s.pool.clear(err, serviceID)
	s.cancelCheck()
}

// Description returns a description of the server as of the last heartbeat.
func (s *Server) Description() description.Server {
	return s.desc.Load().(description.Server)
}

// SelectedDescription returns a description.SelectedServer with a Kind of
// Single. This can be used when performing tasks like monitoring a batch
// of servers and you want to run one-off commands against those servers.
func (s *Server) SelectedDescription() description.SelectedServer {
	sdesc := s.Description()
	return description.SelectedServer{
		Server: sdesc,
		Kind:   description.Single,
	}
}

// Subscribe returns a ServerSubscription which has a channel on which all
// updated server descriptions will be sent. The channel will have a buffer
// size of one, and will be pre-populated with the current description.
func (s *Server) Subscribe() (*ServerSubscription, error) {
	if atomic.LoadInt64(&s.state) != serverConnected {
		return nil, ErrSubscribeAfterClosed
	}
	ch := make(chan description.Server, 1)
	ch <- s.desc.Load().(description.Server)

	s.subLock.Lock()
	defer s.subLock.Unlock()
	if s.subscriptionsClosed {
		return nil, ErrSubscribeAfterClosed
	}
	id := s.currentSubscriberID
	s.subscribers[id] = ch
	s.currentSubscriberID++

	ss := &ServerSubscription{
		C:  ch,
		s:  s,
		id: id,
	}

	return ss, nil
}
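
// The following is an illustrative sketch of how a subscriber (for example, the parent Topology)
// might consume description updates; the consuming goroutine is an assumption, not code from this
// package.
//
//	sub, err := srv.Subscribe()
//	if err != nil {
//		// handle error
//	}
//	go func() {
//		for desc := range sub.C {
//			// react to the new description.Server
//			_ = desc
//		}
//	}()
//	// later:
//	_ = sub.Unsubscribe()
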
// RequestImmediateCheck will cause the server to send a heartbeat immediately
// instead of waiting for the heartbeat timeout.
func (s *Server) RequestImmediateCheck() {
	select {
	case s.checkNow <- struct{}{}:
	default:
	}
}

// getWriteConcernErrorForProcessing extracts a driver.WriteConcernError from the provided error. This function returns
// (error, true) if the error is a WriteConcernError that falls under the requirements for SDAM error
// handling and (nil, false) otherwise.
func getWriteConcernErrorForProcessing(err error) (*driver.WriteConcernError, bool) {
	writeCmdErr, ok := err.(driver.WriteCommandError)
	if !ok {
		return nil, false
	}

	wcerr := writeCmdErr.WriteConcernError
	if wcerr != nil && (wcerr.NodeIsRecovering() || wcerr.NotPrimary()) {
		return wcerr, true
	}
	return nil, false
}

// ProcessError handles SDAM error handling and implements driver.ErrorProcessor.
func (s *Server) ProcessError(err error, conn driver.Connection) driver.ProcessErrorResult {
	// ignore nil error
	if err == nil {
		return driver.NoChange
	}

	// Must hold the processErrorLock while updating the server description and clearing the pool.
	// Not holding the lock leads to possible out-of-order processing of pool.clear() and
	// pool.ready() calls from concurrent server description updates.
	s.processErrorLock.Lock()
	defer s.processErrorLock.Unlock()

	// ignore stale error
	if conn.Stale() {
		return driver.NoChange
	}

	// Invalidate the server description if a "not primary" or "node is recovering" error occurs.
	// These errors can be reported as a command error or a write concern error.
	desc := conn.Description()
	if cerr, ok := err.(driver.Error); ok && (cerr.NodeIsRecovering() || cerr.NotPrimary()) {
		// ignore stale error
		if desc.TopologyVersion.CompareToIncoming(cerr.TopologyVersion) >= 0 {
			return driver.NoChange
		}

		// updates description to unknown
		s.updateDescription(description.NewServerFromError(s.address, err, cerr.TopologyVersion))
		s.RequestImmediateCheck()

		res := driver.ServerMarkedUnknown
		// If the node is shutting down or is older than 4.2, we synchronously clear the pool
		if cerr.NodeIsShuttingDown() || desc.WireVersion == nil || desc.WireVersion.Max < 8 {
			res = driver.ConnectionPoolCleared
			s.pool.clear(err, desc.ServiceID)
		}

		return res
	}
	if wcerr, ok := getWriteConcernErrorForProcessing(err); ok {
		// ignore stale error
		if desc.TopologyVersion.CompareToIncoming(wcerr.TopologyVersion) >= 0 {
			return driver.NoChange
		}

		// updates description to unknown
		s.updateDescription(description.NewServerFromError(s.address, err, wcerr.TopologyVersion))
		s.RequestImmediateCheck()

		res := driver.ServerMarkedUnknown
		// If the node is shutting down or is older than 4.2, we synchronously clear the pool
		if wcerr.NodeIsShuttingDown() || desc.WireVersion == nil || desc.WireVersion.Max < 8 {
			res = driver.ConnectionPoolCleared
			s.pool.clear(err, desc.ServiceID)
		}

		return res
	}

	wrappedConnErr := unwrapConnectionError(err)
	if wrappedConnErr == nil {
		return driver.NoChange
	}

	// Ignore transient timeout errors.
	if netErr, ok := wrappedConnErr.(net.Error); ok && netErr.Timeout() {
		return driver.NoChange
	}
	if wrappedConnErr == context.Canceled || wrappedConnErr == context.DeadlineExceeded {
		return driver.NoChange
	}

	// For a non-timeout network error, we clear the pool, set the description to Unknown, and cancel the in-progress
	// monitoring check. The check is cancelled last to avoid a post-cancellation reconnect racing with
	// updateDescription.
	s.updateDescription(description.NewServerFromError(s.address, err, nil))
	s.pool.clear(err, desc.ServiceID)
	s.cancelCheck()
	return driver.ConnectionPoolCleared
}

// update handles performing heartbeats and updating any subscribers of the
// newest description.Server retrieved.
func (s *Server) update() {
	defer s.closewg.Done()
	heartbeatTicker := time.NewTicker(s.cfg.heartbeatInterval)
	rateLimiter := time.NewTicker(minHeartbeatInterval)
	defer heartbeatTicker.Stop()
	defer rateLimiter.Stop()
	checkNow := s.checkNow
	done := s.done

	var doneOnce bool
	defer func() {
		if r := recover(); r != nil {
			if doneOnce {
				return
			}
			// We keep this goroutine alive attempting to read from the done channel.
			<-done
		}
	}()

	closeServer := func() {
		doneOnce = true
		s.subLock.Lock()
		for id, c := range s.subscribers {
			close(c)
			delete(s.subscribers, id)
		}
		s.subscriptionsClosed = true
		s.subLock.Unlock()

		// We don't need to take s.heartbeatLock here because closeServer is called synchronously when the select
		// checks below detect that the server is being closed, so we can be sure that the connection isn't being used.
		if s.conn != nil {
			_ = s.conn.close()
		}
	}

	waitUntilNextCheck := func() {
		// Wait until heartbeatFrequency elapses, an application operation requests an immediate check, or the server
		// is disconnecting.
		select {
		case <-heartbeatTicker.C:
		case <-checkNow:
		case <-done:
			// Return because the next update iteration will check the done channel again and clean up.
			return
		}

		// Ensure we only return if minHeartbeatFrequency has elapsed or the server is disconnecting.
		select {
		case <-rateLimiter.C:
		case <-done:
			return
		}
	}

	timeoutCnt := 0
	for {
		// Check if the server is disconnecting.
		// Even if waitUntilNextCheck has already read from the done channel, we
		// can safely read from it again because Disconnect closes the channel.
		select {
		case <-done:
			closeServer()
			return
		default:
		}

		previousDescription := s.Description()

		// Perform the next check.
		desc, err := s.check()
		if err == errCheckCancelled {
			if atomic.LoadInt64(&s.state) != serverConnected {
				continue
			}

			// If the server is not disconnecting, the check was cancelled by an application operation after an error.
			// Wait before running the next check.
			waitUntilNextCheck()
			continue
		}

		if isShortcut := func() bool {
			// Must hold the processErrorLock while updating the server description and clearing the
			// pool. Not holding the lock leads to possible out-of-order processing of pool.clear() and
			// pool.ready() calls from concurrent server description updates.
			s.processErrorLock.Lock()
			defer s.processErrorLock.Unlock()

			s.updateDescription(desc)

			// Retry after the first timeout before clearing the pool in case of a FAAS pause as
			// described in GODRIVER-2577.
			if err := unwrapConnectionError(desc.LastError); err != nil && timeoutCnt < 1 {
				if err == context.Canceled || err == context.DeadlineExceeded {
					timeoutCnt++
					// We want to immediately retry on timeout error. Continue to next loop.
					return true
				}
				if err, ok := err.(net.Error); ok && err.Timeout() {
					timeoutCnt++
					// We want to immediately retry on timeout error. Continue to next loop.
					return true
				}
			}

			if err := desc.LastError; err != nil {
				// Clear the pool once the description has been updated to Unknown. Pass in a nil service ID to clear
				// because the monitoring routine only runs for non-load balanced deployments in which servers don't
				// return IDs.
				s.pool.clear(err, nil)
			}

			// We're either not handling a timeout error, or we just handled the 2nd consecutive
			// timeout error. In either case, reset the timeout count to 0 and return false to
			// continue the normal check process.
			timeoutCnt = 0
			return false
		}(); isShortcut {
			continue
		}

		// If the server supports streaming or we're already streaming, we want to move to streaming the next response
		// without waiting. If the server has transitioned to Unknown from a network error, we want to do another
		// check without waiting in case it was a transient error and the server isn't actually down.
		serverSupportsStreaming := desc.Kind != description.Unknown && desc.TopologyVersion != nil
		connectionIsStreaming := s.conn != nil && s.conn.getCurrentlyStreaming()
		transitionedFromNetworkError := desc.LastError != nil && unwrapConnectionError(desc.LastError) != nil &&
			previousDescription.Kind != description.Unknown

		if serverSupportsStreaming || connectionIsStreaming || transitionedFromNetworkError {
			continue
		}

		// The server either does not support the streamable protocol or is not in a healthy state, so we wait until
		// the next check.
		waitUntilNextCheck()
	}
}

// updateDescription handles updating the description on the Server, notifying
// subscribers, and potentially draining the connection pool.
func (s *Server) updateDescription(desc description.Server) {
	if s.cfg.loadBalanced {
		// In load balanced mode, there are no updates from the monitoring routine. For errors encountered in pooled
		// connections, the server should not be marked Unknown to ensure that the LB remains selectable.
		return
	}

	defer func() {
		// ¯\_(ツ)_/¯
		_ = recover()
	}()

	// Anytime we update the server description to something other than "unknown", set the pool to
	// "ready".
	// Do this before updating the description so that connections can be checked out as
	// soon as the server is selectable. If the pool is already ready, this operation is a no-op.
	// Note that this behavior is roughly consistent with the current Go driver behavior (connects
	// to all servers, even non-data-bearing nodes) but deviates slightly from the CMAP spec, which
	// specifies a more restricted set of server descriptions and topologies that should mark the
	// pool ready. We don't have access to the topology here, so prefer the current Go driver
	// behavior for simplicity.
	if desc.Kind != description.Unknown {
		_ = s.pool.ready()
	}

	// Use the updateTopologyCallback to update the parent Topology and get the description that should be stored.
	callback, ok := s.updateTopologyCallback.Load().(updateTopologyCallback)
	if ok && callback != nil {
		desc = callback(desc)
	}
	s.desc.Store(desc)

	s.subLock.Lock()
	for _, c := range s.subscribers {
		select {
		// drain the channel if it isn't empty
		case <-c:
		default:
		}
		c <- desc
	}
	s.subLock.Unlock()
}

// createConnection creates a new connection instance but does not call connect on it. The caller must call connect
// before the connection can be used for network operations.
func (s *Server) createConnection() *connection {
	opts := copyConnectionOpts(s.cfg.connectionOpts)
	opts = append(opts,
		WithConnectTimeout(func(time.Duration) time.Duration { return s.cfg.heartbeatTimeout }),
		WithReadTimeout(func(time.Duration) time.Duration { return s.cfg.heartbeatTimeout }),
		WithWriteTimeout(func(time.Duration) time.Duration { return s.cfg.heartbeatTimeout }),
		// We override whatever handshaker is currently attached to the options with a basic
		// one because we need to make sure we don't do auth.
		WithHandshaker(func(h Handshaker) Handshaker {
			return operation.NewHello().AppName(s.cfg.appname).Compressors(s.cfg.compressionOpts).
				ServerAPI(s.cfg.serverAPI)
		}),
		// Override any monitors specified in options with nil to avoid monitoring heartbeats.
		WithMonitor(func(*event.CommandMonitor) *event.CommandMonitor { return nil }),
	)

	return newConnection(s.address, opts...)
}

func copyConnectionOpts(opts []ConnectionOption) []ConnectionOption {
	optsCopy := make([]ConnectionOption, len(opts))
	copy(optsCopy, opts)
	return optsCopy
}

func (s *Server) setupHeartbeatConnection() error {
	conn := s.createConnection()

	// Take the lock when assigning the context and connection because they're accessed by cancelCheck.
	s.heartbeatLock.Lock()
	if s.heartbeatCtxCancel != nil {
		// Ensure the previous context is cancelled to avoid a leak.
		s.heartbeatCtxCancel()
	}
	s.heartbeatCtx, s.heartbeatCtxCancel = context.WithCancel(s.globalCtx)
	s.conn = conn
	s.heartbeatLock.Unlock()

	return s.conn.connect(s.heartbeatCtx)
}

// cancelCheck cancels in-progress connection dials and reads. It does not set any fields on the server.
func (s *Server) cancelCheck() {
	var conn *connection

	// Take heartbeatLock for mutual exclusion with the checks in the update function.
	s.heartbeatLock.Lock()
	if s.heartbeatCtx != nil {
		s.heartbeatCtxCancel()
	}
	conn = s.conn
	s.heartbeatLock.Unlock()

	if conn == nil {
		return
	}

	// If the connection exists, we need to wait for it to be connected because conn.connect() and
	// conn.close() cannot be called concurrently. If the connection wasn't successfully opened, its
	// state was set back to disconnected, so calling conn.close() will be a no-op.
	conn.closeConnectContext()
	conn.wait()
	_ = conn.close()
}

func (s *Server) checkWasCancelled() bool {
	return s.heartbeatCtx.Err() != nil
}

func (s *Server) createBaseOperation(conn driver.Connection) *operation.Hello {
	return operation.
		NewHello().
		ClusterClock(s.cfg.clock).
		Deployment(driver.SingleConnectionDeployment{conn}).
		ServerAPI(s.cfg.serverAPI)
}
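
// Illustrative note on the awaitable-heartbeat arithmetic in check() below, using assumed values
// rather than values read from this file: with a heartbeat interval of 10s and a heartbeat
// (connect) timeout of 10s, maxAwaitTimeMS = int64(10*time.Second)/1e6 = 10000 and the socket
// timeout becomes 10s + 10s = 20s; with a heartbeat timeout of 0, the socket timeout stays 0
// (infinite).
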
func (s *Server) check() (description.Server, error) {
	var descPtr *description.Server
	var err error
	var durationNanos int64

	start := time.Now()
	if s.conn == nil || s.conn.closed() || s.checkWasCancelled() {
		// Create a new connection if this is the first check, the connection was closed after an error during the
		// previous check, or the previous check was cancelled.
		isNilConn := s.conn == nil
		if !isNilConn {
			s.publishServerHeartbeatStartedEvent(s.conn.ID(), false)
		}
		// Create a new connection and add its handshake RTT as a sample.
		err = s.setupHeartbeatConnection()
		durationNanos = time.Since(start).Nanoseconds()
		if err == nil {
			// Use the description from the connection handshake as the value for this check.
			s.rttMonitor.addSample(s.conn.helloRTT)
			descPtr = &s.conn.desc
			if !isNilConn {
				s.publishServerHeartbeatSucceededEvent(s.conn.ID(), durationNanos, s.conn.desc, false)
			}
		} else {
			err = unwrapConnectionError(err)
			if !isNilConn {
				s.publishServerHeartbeatFailedEvent(s.conn.ID(), durationNanos, err, false)
			}
		}
	} else {
		// An existing connection is being used. Use the server description properties to execute the right heartbeat.

		// Wrap conn in a type that implements driver.StreamerConnection.
		heartbeatConn := initConnection{s.conn}
		baseOperation := s.createBaseOperation(heartbeatConn)
		previousDescription := s.Description()
		streamable := previousDescription.TopologyVersion != nil

		s.publishServerHeartbeatStartedEvent(s.conn.ID(), s.conn.getCurrentlyStreaming() || streamable)

		switch {
		case s.conn.getCurrentlyStreaming():
			// The connection is already in a streaming state, so we stream the next response.
			err = baseOperation.StreamResponse(s.heartbeatCtx, heartbeatConn)
		case streamable:
			// The server supports the streamable protocol. Set the socket timeout to
			// connectTimeoutMS+heartbeatFrequencyMS and execute an awaitable hello request. Set conn.canStream so
			// the wire message will advertise streaming support to the server.

			// Calculation for maxAwaitTimeMS is taken from time.Duration.Milliseconds (added in Go 1.13).
			maxAwaitTimeMS := int64(s.cfg.heartbeatInterval) / 1e6

			// If connectTimeoutMS=0, the socket timeout should be infinite. Otherwise, it is connectTimeoutMS +
			// heartbeatFrequencyMS to account for the fact that the query will block for heartbeatFrequencyMS
			// server-side.
			socketTimeout := s.cfg.heartbeatTimeout
			if socketTimeout != 0 {
				socketTimeout += s.cfg.heartbeatInterval
			}
			s.conn.setSocketTimeout(socketTimeout)
			baseOperation = baseOperation.TopologyVersion(previousDescription.TopologyVersion).
				MaxAwaitTimeMS(maxAwaitTimeMS)
			s.conn.setCanStream(true)
			err = baseOperation.Execute(s.heartbeatCtx)
		default:
			// The server doesn't support the awaitable protocol. Set the socket timeout to connectTimeoutMS and
			// execute a regular heartbeat without any additional parameters.
			s.conn.setSocketTimeout(s.cfg.heartbeatTimeout)
			err = baseOperation.Execute(s.heartbeatCtx)
		}

		durationNanos = time.Since(start).Nanoseconds()

		if err == nil {
			tempDesc := baseOperation.Result(s.address)
			descPtr = &tempDesc
			s.publishServerHeartbeatSucceededEvent(s.conn.ID(), durationNanos, tempDesc,
				s.conn.getCurrentlyStreaming() || streamable)
		} else {
			// Close the connection here rather than below so we ensure we're not closing a connection that wasn't
			// successfully created.
			if s.conn != nil {
				_ = s.conn.close()
			}
			s.publishServerHeartbeatFailedEvent(s.conn.ID(), durationNanos, err,
				s.conn.getCurrentlyStreaming() || streamable)
		}
	}

	if descPtr != nil {
		// The check was successful. Set the average RTT and return.
		desc := *descPtr
		desc = desc.SetAverageRTT(s.rttMonitor.EWMA())
		desc.HeartbeatInterval = s.cfg.heartbeatInterval
		return desc, nil
	}

	if s.checkWasCancelled() {
		// If the previous check was cancelled, we don't want to clear the pool. Return a sentinel error so the caller
		// will know that an actual error didn't occur.
		return emptyDescription, errCheckCancelled
	}

	// An error occurred. We reset the RTT monitor for all errors and return an Unknown description. The pool must also
	// be cleared, but only after the description has already been updated, so that is handled by the caller.
	topologyVersion := extractTopologyVersion(err)
	s.rttMonitor.reset()

	return description.NewServerFromError(s.address, err, topologyVersion), nil
}

func extractTopologyVersion(err error) *description.TopologyVersion {
	if ce, ok := err.(ConnectionError); ok {
		err = ce.Wrapped
	}

	switch converted := err.(type) {
	case driver.Error:
		return converted.TopologyVersion
	case driver.WriteCommandError:
		if converted.WriteConcernError != nil {
			return converted.WriteConcernError.TopologyVersion
		}
	}

	return nil
}

// RTTMonitor returns this server's round-trip-time monitor.
func (s *Server) RTTMonitor() driver.RTTMonitor {
	return s.rttMonitor
}

// OperationCount returns the current number of in-progress operations for this server.
func (s *Server) OperationCount() int64 {
	return atomic.LoadInt64(&s.operationCount)
}

// String implements the Stringer interface.
func (s *Server) String() string {
	desc := s.Description()
	state := atomic.LoadInt64(&s.state)
	str := fmt.Sprintf("Addr: %s, Type: %s, State: %s",
		s.address, desc.Kind, serverStateString(state))
	if len(desc.Tags) != 0 {
		str += fmt.Sprintf(", Tag sets: %s", desc.Tags)
	}
	if state == serverConnected {
		str += fmt.Sprintf(", Average RTT: %s, Min RTT: %s", desc.AverageRTT, s.RTTMonitor().Min())
	}
	if desc.LastError != nil {
		str += fmt.Sprintf(", Last error: %s", desc.LastError)
	}

	return str
}

// ServerSubscription represents a subscription to the description.Server updates for
// a specific server.
type ServerSubscription struct {
	C  <-chan description.Server
	s  *Server
	id uint64
}

// Unsubscribe unsubscribes this ServerSubscription from updates and closes the
// subscription channel.
func (ss *ServerSubscription) Unsubscribe() error {
	ss.s.subLock.Lock()
	defer ss.s.subLock.Unlock()
	if ss.s.subscriptionsClosed {
		return nil
	}

	ch, ok := ss.s.subscribers[ss.id]
	if !ok {
		return nil
	}

	close(ch)
	delete(ss.s.subscribers, ss.id)

	return nil
}

// publishes a ServerOpeningEvent to indicate the server is being initialized
func (s *Server) publishServerOpeningEvent(addr address.Address) {
	if s == nil {
		return
	}

	serverOpening := &event.ServerOpeningEvent{
		Address:    addr,
		TopologyID: s.topologyID,
	}

	if s.cfg.serverMonitor != nil && s.cfg.serverMonitor.ServerOpening != nil {
		s.cfg.serverMonitor.ServerOpening(serverOpening)
	}
}

// publishes a ServerHeartbeatStartedEvent to indicate a hello command has started
func (s *Server) publishServerHeartbeatStartedEvent(connectionID string, await bool) {
	serverHeartbeatStarted := &event.ServerHeartbeatStartedEvent{
		ConnectionID: connectionID,
		Awaited:      await,
	}

	if s != nil && s.cfg.serverMonitor != nil && s.cfg.serverMonitor.ServerHeartbeatStarted != nil {
		s.cfg.serverMonitor.ServerHeartbeatStarted(serverHeartbeatStarted)
	}
}

// publishes a ServerHeartbeatSucceededEvent to indicate hello has succeeded
func (s *Server) publishServerHeartbeatSucceededEvent(connectionID string,
	durationNanos int64,
	desc description.Server,
	await bool,
) {
	serverHeartbeatSucceeded := &event.ServerHeartbeatSucceededEvent{
		DurationNanos: durationNanos,
		Reply:         desc,
		ConnectionID:  connectionID,
		Awaited:       await,
	}

	if s != nil && s.cfg.serverMonitor != nil && s.cfg.serverMonitor.ServerHeartbeatSucceeded != nil {
		s.cfg.serverMonitor.ServerHeartbeatSucceeded(serverHeartbeatSucceeded)
	}
}

// publishes a ServerHeartbeatFailedEvent to indicate hello has failed
func (s *Server) publishServerHeartbeatFailedEvent(connectionID string,
	durationNanos int64,
	err error,
	await bool,
) {
	serverHeartbeatFailed := &event.ServerHeartbeatFailedEvent{
		DurationNanos: durationNanos,
		Failure:       err,
		ConnectionID:  connectionID,
		Awaited:       await,
	}

	if s != nil && s.cfg.serverMonitor != nil && s.cfg.serverMonitor.ServerHeartbeatFailed != nil {
		s.cfg.serverMonitor.ServerHeartbeatFailed(serverHeartbeatFailed)
	}
}

// unwrapConnectionError returns the connection error wrapped by err, or nil if err does not wrap a connection error.
func unwrapConnectionError(err error) error {
	// This is essentially an implementation of errors.As to unwrap this error until we get a ConnectionError and then
	// return ConnectionError.Wrapped.
	connErr, ok := err.(ConnectionError)
	if ok {
		return connErr.Wrapped
	}

	driverErr, ok := err.(driver.Error)
	if !ok || !driverErr.NetworkError() {
		return nil
	}

	connErr, ok = driverErr.Wrapped.(ConnectionError)
	if ok {
		return connErr.Wrapped
	}

	return nil
}