Browse Source

lnpeer: fix timing issues in reestablish_channel, for dataloss case

Assume Alice and Bob have a channel, and Alice is on an old state,
but neither of them knows this yet.
Timing scenarios:
1. Alice sends reest first, and Bob receives it before sending reest himself
  - old code: Bob realises Alice is behind, Bob will force-close,
              Bob won't send reest to Alice, so Alice does not learn she is behind
  - new code: Bob realises Alice is behind, Bob will force-close,
              Bob will still send reest to Alice, so Alice learns she is behind.
2. Bob sends reest first, and Alice receives it before sending reest herself
  - old code: Alice learns she is behind. Alice won't send reest to Bob,
              so Bob does not learn he is ahead, so Bob won't force-close.
  - new code: Alice learns she is behind. Alice will still send reest to Bob,
              though with ctn=0 instead of the actual value. Bob learns he is
              ahead, so Bob will force-close.
3. Alice and Bob both send reest, and then they both receive what the other sent
  - no change: Alice and Bob both learn Alice is behind. Bob will force-close.
master
SomberNight 2 years ago
parent
commit
140d2d0247
No known key found for this signature in database
GPG Key ID: B33B5F232C6271E9
  1. 70
      electrum/lnpeer.py
  2. 68
      electrum/tests/test_lnpeer.py

70
electrum/lnpeer.py

@ -1146,7 +1146,7 @@ class Peer(Logger):
self.logger.info(f"tried to force-close channel {chan.get_id_for_log()} "
f"but close option is not allowed. {chan.get_state()=!r}")
def on_channel_reestablish(self, chan, msg):
def on_channel_reestablish(self, chan: Channel, msg):
their_next_local_ctn = msg["next_commitment_number"]
their_oldest_unrevoked_remote_ctn = msg["next_revocation_number"]
their_local_pcp = msg.get("my_current_per_commitment_point")
@ -1230,40 +1230,23 @@ class Peer(Logger):
self.lnworker.save_channel(chan)
chan.peer_state = PeerState.BAD
# raise after we send channel_reestablish, so the remote can realize they are ahead
fut.set_exception(RemoteMisbehaving("remote ahead of us"))
# FIXME what if we have multiple chans with peer? timing...
fut.set_exception(GracefulDisconnect("remote ahead of us"))
elif we_are_ahead:
self.logger.warning(f"channel_reestablish ({chan.get_id_for_log()}): we are ahead of remote! trying to force-close.")
self.schedule_force_closing(chan.channel_id)
fut.set_exception(RemoteMisbehaving("we are ahead of remote"))
# FIXME what if we have multiple chans with peer? timing...
fut.set_exception(GracefulDisconnect("we are ahead of remote"))
else:
# all good
fut.set_result((we_must_resend_revoke_and_ack, their_next_local_ctn))
async def reestablish_channel(self, chan: Channel):
await self.initialized
def _send_channel_reestablish(self, chan: Channel):
assert self.is_initialized()
chan_id = chan.channel_id
if chan.should_request_force_close:
chan.set_state(ChannelState.REQUESTED_FCLOSE)
await self.request_force_close(chan_id)
chan.should_request_force_close = False
return
assert ChannelState.PREOPENING < chan.get_state() < ChannelState.FORCE_CLOSING
if chan.peer_state != PeerState.DISCONNECTED:
self.logger.info(
f'reestablish_channel was called but channel {chan.get_id_for_log()} '
f'already in peer_state {chan.peer_state!r}')
return
chan.peer_state = PeerState.REESTABLISHING
util.trigger_callback('channel', self.lnworker.wallet, chan)
# ctns
oldest_unrevoked_local_ctn = chan.get_oldest_unrevoked_ctn(LOCAL)
latest_local_ctn = chan.get_latest_ctn(LOCAL)
next_local_ctn = chan.get_next_ctn(LOCAL)
oldest_unrevoked_remote_ctn = chan.get_oldest_unrevoked_ctn(REMOTE)
latest_remote_ctn = chan.get_latest_ctn(REMOTE)
next_remote_ctn = chan.get_next_ctn(REMOTE)
# BOLT-02: "A node [...] upon disconnection [...] MUST reverse any uncommitted updates sent by the other side"
chan.hm.discard_unsigned_remote_updates()
# send message
assert chan.is_static_remotekey_enabled()
latest_secret, latest_point = chan.get_secret_and_point(LOCAL, 0)
@ -1284,6 +1267,45 @@ class Peer(Logger):
f'(next_local_ctn={next_local_ctn}, '
f'oldest_unrevoked_remote_ctn={oldest_unrevoked_remote_ctn})')
async def reestablish_channel(self, chan: Channel):
await self.initialized
chan_id = chan.channel_id
if chan.get_state() == ChannelState.WE_ARE_TOXIC:
# Depending on timing, the remote might not know we are behind.
# We should let them know, so that they force-close.
# We do "request force-close" with ctn=0, instead of leaking our actual ctns,
# to decrease the remote's confidence of actual data loss on our part.
await self.request_force_close(chan_id)
return
if chan.get_state() == ChannelState.FORCE_CLOSING:
# We likely got here because we found out that we are ahead (i.e. remote lost state).
# Depending on timing, the remote might not know they are behind.
# We should let them know:
self._send_channel_reestablish(chan)
return
if chan.should_request_force_close:
chan.set_state(ChannelState.REQUESTED_FCLOSE)
await self.request_force_close(chan_id)
chan.should_request_force_close = False
return
# if we get here, we will try to do a proper reestablish
if not (ChannelState.PREOPENING < chan.get_state() < ChannelState.FORCE_CLOSING):
raise Exception(f"unexpected {chan.get_state()=} for reestablish")
if chan.peer_state != PeerState.DISCONNECTED:
self.logger.info(
f'reestablish_channel was called but channel {chan.get_id_for_log()} '
f'already in peer_state {chan.peer_state!r}')
return
chan.peer_state = PeerState.REESTABLISHING
util.trigger_callback('channel', self.lnworker.wallet, chan)
# ctns
oldest_unrevoked_local_ctn = chan.get_oldest_unrevoked_ctn(LOCAL)
next_local_ctn = chan.get_next_ctn(LOCAL)
oldest_unrevoked_remote_ctn = chan.get_oldest_unrevoked_ctn(REMOTE)
# BOLT-02: "A node [...] upon disconnection [...] MUST reverse any uncommitted updates sent by the other side"
chan.hm.discard_unsigned_remote_updates()
# send message
self._send_channel_reestablish(chan)
# wait until we receive their channel_reestablish
fut = self.channel_reestablish_msg[chan_id]
await fut

68
electrum/tests/test_lnpeer.py

@ -566,30 +566,50 @@ class TestPeerDirect(TestPeer):
await gath
async def test_reestablish_with_old_state(self):
random_seed = os.urandom(32)
alice_channel, bob_channel = create_test_channels(random_seed=random_seed)
alice_channel_0, bob_channel_0 = create_test_channels(random_seed=random_seed) # these are identical
p1, p2, w1, w2, _q1, _q2 = self.prepare_peers(alice_channel, bob_channel)
lnaddr, pay_req = self.prepare_invoice(w2)
async def pay():
result, log = await w1.pay_invoice(pay_req)
self.assertEqual(result, True)
gath.cancel()
gath = asyncio.gather(pay(), p1._message_loop(), p2._message_loop(), p1.htlc_switch(), p2.htlc_switch())
with self.assertRaises(asyncio.CancelledError):
await gath
p1, p2, w1, w2, _q1, _q2 = self.prepare_peers(alice_channel_0, bob_channel)
for chan in (alice_channel_0, bob_channel):
chan.peer_state = PeerState.DISCONNECTED
async def reestablish():
await asyncio.gather(
p1.reestablish_channel(alice_channel_0),
p2.reestablish_channel(bob_channel))
gath = asyncio.gather(reestablish(), p1._message_loop(), p2._message_loop(), p1.htlc_switch(), p2.htlc_switch())
with self.assertRaises(lnutil.RemoteMisbehaving):
await gath
self.assertEqual(alice_channel_0.peer_state, PeerState.BAD)
self.assertEqual(bob_channel._state, ChannelState.FORCE_CLOSING)
async def f(alice_slow: bool, bob_slow: bool):
random_seed = os.urandom(32)
alice_channel, bob_channel = create_test_channels(random_seed=random_seed)
alice_channel_0, bob_channel_0 = create_test_channels(random_seed=random_seed) # these are identical
p1, p2, w1, w2, _q1, _q2 = self.prepare_peers(alice_channel, bob_channel)
lnaddr, pay_req = self.prepare_invoice(w2)
async def pay():
result, log = await w1.pay_invoice(pay_req)
self.assertEqual(result, True)
gath.cancel()
gath = asyncio.gather(pay(), p1._message_loop(), p2._message_loop(), p1.htlc_switch(), p2.htlc_switch())
with self.assertRaises(asyncio.CancelledError):
await gath
p1, p2, w1, w2, _q1, _q2 = self.prepare_peers(alice_channel_0, bob_channel)
for chan in (alice_channel_0, bob_channel):
chan.peer_state = PeerState.DISCONNECTED
async def alice_sends_reest():
if alice_slow: await asyncio.sleep(0.05)
await p1.reestablish_channel(alice_channel_0)
async def bob_sends_reest():
if bob_slow: await asyncio.sleep(0.05)
await p2.reestablish_channel(bob_channel)
with self.assertRaises(GracefulDisconnect):
async with OldTaskGroup() as group:
await group.spawn(p1._message_loop())
await group.spawn(p1.htlc_switch())
await group.spawn(p2._message_loop())
await group.spawn(p2.htlc_switch())
await group.spawn(alice_sends_reest)
await group.spawn(bob_sends_reest)
self.assertEqual(alice_channel_0.peer_state, PeerState.BAD)
self.assertEqual(alice_channel_0._state, ChannelState.WE_ARE_TOXIC)
self.assertEqual(bob_channel._state, ChannelState.FORCE_CLOSING)
with self.subTest(msg="both fast"):
# FIXME: we want to test the case where both Alice and Bob send channel-reestablish before
# receiving what the other sent. This is not a reliable way to do that...
await f(alice_slow=False, bob_slow=False)
with self.subTest(msg="alice is slow"):
await f(alice_slow=True, bob_slow=False)
with self.subTest(msg="bob is slow"):
await f(alice_slow=False, bob_slow=True)
@staticmethod
def _send_fake_htlc(peer: Peer, chan: Channel) -> UpdateAddHtlc:

Loading…
Cancel
Save