From 140d2d02478d2f79ea57851c8b2309645f6725e5 Mon Sep 17 00:00:00 2001 From: SomberNight Date: Mon, 15 Jan 2024 17:31:39 +0000 Subject: [PATCH 1/4] lnpeer: fix timing issues in reestablish_channel, for dataloss case Assume Alice and Bob have a channel, and Alice is on an old state, but neither of them knows the latter yet. Timing scenarios: 1. Alice sends reest first, and Bob receives it before sending reest himself - old code: Bob realises Alice is behind, Bob will force-close, Bob won't send reest to Alice, so Alice does not learn she is behind - new code: Bob realises Alice is behind, Bob will force-close, Bob will still send reest to Alice, so Alice learns she is behind. 2. Bob sends reest first, and Alice receives it before sending reest herself - old code: Alice learns she is behind. Alice won't send reest to Bob, so Bob does not learn he is ahead, so Bob won't force-close. - new code: Alice learns she is behind. Alice will still send reest to Bob though with ctn=0 instead of actual. Bob learns he is ahead, so Bob will force close. 3. Alice and Bob both send reest, and then they both receive what the other sent - no change: Alice and Bob both learn Alice is behind. Bob will force-close. --- electrum/lnpeer.py | 70 +++++++++++++++++++++++------------ electrum/tests/test_lnpeer.py | 68 ++++++++++++++++++++++------------ 2 files changed, 90 insertions(+), 48 deletions(-) diff --git a/electrum/lnpeer.py b/electrum/lnpeer.py index 37b67579e..e3db405f7 100644 --- a/electrum/lnpeer.py +++ b/electrum/lnpeer.py @@ -1146,7 +1146,7 @@ class Peer(Logger): self.logger.info(f"tried to force-close channel {chan.get_id_for_log()} " f"but close option is not allowed. {chan.get_state()=!r}") - def on_channel_reestablish(self, chan, msg): + def on_channel_reestablish(self, chan: Channel, msg): their_next_local_ctn = msg["next_commitment_number"] their_oldest_unrevoked_remote_ctn = msg["next_revocation_number"] their_local_pcp = msg.get("my_current_per_commitment_point") @@ -1230,40 +1230,23 @@ class Peer(Logger): self.lnworker.save_channel(chan) chan.peer_state = PeerState.BAD # raise after we send channel_reestablish, so the remote can realize they are ahead - fut.set_exception(RemoteMisbehaving("remote ahead of us")) + # FIXME what if we have multiple chans with peer? timing... + fut.set_exception(GracefulDisconnect("remote ahead of us")) elif we_are_ahead: self.logger.warning(f"channel_reestablish ({chan.get_id_for_log()}): we are ahead of remote! trying to force-close.") self.schedule_force_closing(chan.channel_id) - fut.set_exception(RemoteMisbehaving("we are ahead of remote")) + # FIXME what if we have multiple chans with peer? timing... + fut.set_exception(GracefulDisconnect("we are ahead of remote")) else: # all good fut.set_result((we_must_resend_revoke_and_ack, their_next_local_ctn)) - async def reestablish_channel(self, chan: Channel): - await self.initialized + def _send_channel_reestablish(self, chan: Channel): + assert self.is_initialized() chan_id = chan.channel_id - if chan.should_request_force_close: - chan.set_state(ChannelState.REQUESTED_FCLOSE) - await self.request_force_close(chan_id) - chan.should_request_force_close = False - return - assert ChannelState.PREOPENING < chan.get_state() < ChannelState.FORCE_CLOSING - if chan.peer_state != PeerState.DISCONNECTED: - self.logger.info( - f'reestablish_channel was called but channel {chan.get_id_for_log()} ' - f'already in peer_state {chan.peer_state!r}') - return - chan.peer_state = PeerState.REESTABLISHING - util.trigger_callback('channel', self.lnworker.wallet, chan) # ctns - oldest_unrevoked_local_ctn = chan.get_oldest_unrevoked_ctn(LOCAL) - latest_local_ctn = chan.get_latest_ctn(LOCAL) next_local_ctn = chan.get_next_ctn(LOCAL) oldest_unrevoked_remote_ctn = chan.get_oldest_unrevoked_ctn(REMOTE) - latest_remote_ctn = chan.get_latest_ctn(REMOTE) - next_remote_ctn = chan.get_next_ctn(REMOTE) - # BOLT-02: "A node [...] upon disconnection [...] MUST reverse any uncommitted updates sent by the other side" - chan.hm.discard_unsigned_remote_updates() # send message assert chan.is_static_remotekey_enabled() latest_secret, latest_point = chan.get_secret_and_point(LOCAL, 0) @@ -1284,6 +1267,45 @@ class Peer(Logger): f'(next_local_ctn={next_local_ctn}, ' f'oldest_unrevoked_remote_ctn={oldest_unrevoked_remote_ctn})') + async def reestablish_channel(self, chan: Channel): + await self.initialized + chan_id = chan.channel_id + if chan.get_state() == ChannelState.WE_ARE_TOXIC: + # Depending on timing, the remote might not know we are behind. + # We should let them know, so that they force-close. + # We do "request force-close" with ctn=0, instead of leaking our actual ctns, + # to decrease the remote's confidence of actual data loss on our part. + await self.request_force_close(chan_id) + return + if chan.get_state() == ChannelState.FORCE_CLOSING: + # We likely got here because we found out that we are ahead (i.e. remote lost state). + # Depending on timing, the remote might not know they are behind. + # We should let them know: + self._send_channel_reestablish(chan) + return + if chan.should_request_force_close: + chan.set_state(ChannelState.REQUESTED_FCLOSE) + await self.request_force_close(chan_id) + chan.should_request_force_close = False + return + # if we get here, we will try to do a proper reestablish + if not (ChannelState.PREOPENING < chan.get_state() < ChannelState.FORCE_CLOSING): + raise Exception(f"unexpected {chan.get_state()=} for reestablish") + if chan.peer_state != PeerState.DISCONNECTED: + self.logger.info( + f'reestablish_channel was called but channel {chan.get_id_for_log()} ' + f'already in peer_state {chan.peer_state!r}') + return + chan.peer_state = PeerState.REESTABLISHING + util.trigger_callback('channel', self.lnworker.wallet, chan) + # ctns + oldest_unrevoked_local_ctn = chan.get_oldest_unrevoked_ctn(LOCAL) + next_local_ctn = chan.get_next_ctn(LOCAL) + oldest_unrevoked_remote_ctn = chan.get_oldest_unrevoked_ctn(REMOTE) + # BOLT-02: "A node [...] upon disconnection [...] MUST reverse any uncommitted updates sent by the other side" + chan.hm.discard_unsigned_remote_updates() + # send message + self._send_channel_reestablish(chan) # wait until we receive their channel_reestablish fut = self.channel_reestablish_msg[chan_id] await fut diff --git a/electrum/tests/test_lnpeer.py b/electrum/tests/test_lnpeer.py index fc25a8bb0..8d04474a1 100644 --- a/electrum/tests/test_lnpeer.py +++ b/electrum/tests/test_lnpeer.py @@ -566,30 +566,50 @@ class TestPeerDirect(TestPeer): await gath async def test_reestablish_with_old_state(self): - random_seed = os.urandom(32) - alice_channel, bob_channel = create_test_channels(random_seed=random_seed) - alice_channel_0, bob_channel_0 = create_test_channels(random_seed=random_seed) # these are identical - p1, p2, w1, w2, _q1, _q2 = self.prepare_peers(alice_channel, bob_channel) - lnaddr, pay_req = self.prepare_invoice(w2) - async def pay(): - result, log = await w1.pay_invoice(pay_req) - self.assertEqual(result, True) - gath.cancel() - gath = asyncio.gather(pay(), p1._message_loop(), p2._message_loop(), p1.htlc_switch(), p2.htlc_switch()) - with self.assertRaises(asyncio.CancelledError): - await gath - p1, p2, w1, w2, _q1, _q2 = self.prepare_peers(alice_channel_0, bob_channel) - for chan in (alice_channel_0, bob_channel): - chan.peer_state = PeerState.DISCONNECTED - async def reestablish(): - await asyncio.gather( - p1.reestablish_channel(alice_channel_0), - p2.reestablish_channel(bob_channel)) - gath = asyncio.gather(reestablish(), p1._message_loop(), p2._message_loop(), p1.htlc_switch(), p2.htlc_switch()) - with self.assertRaises(lnutil.RemoteMisbehaving): - await gath - self.assertEqual(alice_channel_0.peer_state, PeerState.BAD) - self.assertEqual(bob_channel._state, ChannelState.FORCE_CLOSING) + async def f(alice_slow: bool, bob_slow: bool): + random_seed = os.urandom(32) + alice_channel, bob_channel = create_test_channels(random_seed=random_seed) + alice_channel_0, bob_channel_0 = create_test_channels(random_seed=random_seed) # these are identical + p1, p2, w1, w2, _q1, _q2 = self.prepare_peers(alice_channel, bob_channel) + lnaddr, pay_req = self.prepare_invoice(w2) + async def pay(): + result, log = await w1.pay_invoice(pay_req) + self.assertEqual(result, True) + gath.cancel() + gath = asyncio.gather(pay(), p1._message_loop(), p2._message_loop(), p1.htlc_switch(), p2.htlc_switch()) + with self.assertRaises(asyncio.CancelledError): + await gath + p1, p2, w1, w2, _q1, _q2 = self.prepare_peers(alice_channel_0, bob_channel) + for chan in (alice_channel_0, bob_channel): + chan.peer_state = PeerState.DISCONNECTED + + async def alice_sends_reest(): + if alice_slow: await asyncio.sleep(0.05) + await p1.reestablish_channel(alice_channel_0) + async def bob_sends_reest(): + if bob_slow: await asyncio.sleep(0.05) + await p2.reestablish_channel(bob_channel) + + with self.assertRaises(GracefulDisconnect): + async with OldTaskGroup() as group: + await group.spawn(p1._message_loop()) + await group.spawn(p1.htlc_switch()) + await group.spawn(p2._message_loop()) + await group.spawn(p2.htlc_switch()) + await group.spawn(alice_sends_reest) + await group.spawn(bob_sends_reest) + self.assertEqual(alice_channel_0.peer_state, PeerState.BAD) + self.assertEqual(alice_channel_0._state, ChannelState.WE_ARE_TOXIC) + self.assertEqual(bob_channel._state, ChannelState.FORCE_CLOSING) + + with self.subTest(msg="both fast"): + # FIXME: we want to test the case where both Alice and Bob sends channel-reestablish before + # receiving what the other sent. This is not a reliable way to do that... + await f(alice_slow=False, bob_slow=False) + with self.subTest(msg="alice is slow"): + await f(alice_slow=True, bob_slow=False) + with self.subTest(msg="bob is slow"): + await f(alice_slow=False, bob_slow=True) @staticmethod def _send_fake_htlc(peer: Peer, chan: Channel) -> UpdateAddHtlc: From df58dd1f258f6dc8eee832358c2865fe33d27095 Mon Sep 17 00:00:00 2001 From: SomberNight Date: Mon, 15 Jan 2024 17:52:48 +0000 Subject: [PATCH 2/4] lnchannel.get_close_opts: allow REQUEST_REMOTE_FCLOSE if WE_ARE_TOXIC related https://github.com/spesmilo/electrum/issues/8770 --- electrum/gui/qt/channels_list.py | 11 +++++------ electrum/lnchannel.py | 4 ++++ electrum/lnpeer.py | 12 +++++++----- electrum/lnworker.py | 8 +++----- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/electrum/gui/qt/channels_list.py b/electrum/gui/qt/channels_list.py index 1a85045cb..257256cf1 100644 --- a/electrum/gui/qt/channels_list.py +++ b/electrum/gui/qt/channels_list.py @@ -119,9 +119,6 @@ class ChannelsList(MyTreeView): def on_channel_closed(self, txid): self.main_window.show_error('Channel closed' + '\n' + txid) - def on_request_sent(self, b): - self.main_window.show_message(_('Request sent')) - def on_failure(self, exc_info): type_, e, tb = exc_info traceback.print_tb(tb) @@ -137,7 +134,7 @@ class ChannelsList(MyTreeView): on_success = self.on_channel_closed def task(): return self.network.run_from_another_thread(coro) - WaitingDialog(self, 'please wait..', task, on_success, self.on_failure) + WaitingDialog(self, _('Please wait...'), task, on_success, self.on_failure) def force_close(self, channel_id): self.save_backup = True @@ -161,7 +158,7 @@ class ChannelsList(MyTreeView): def task(): coro = self.lnworker.force_close_channel(channel_id) return self.network.run_from_another_thread(coro) - WaitingDialog(self, 'please wait..', task, self.on_channel_closed, self.on_failure) + WaitingDialog(self, _('Please wait...'), task, self.on_channel_closed, self.on_failure) def remove_channel(self, channel_id): if self.main_window.question(_('Are you sure you want to delete this channel? This will purge associated transactions from your wallet history.')): @@ -191,7 +188,9 @@ class ChannelsList(MyTreeView): def task(): coro = self.lnworker.request_force_close(channel_id) return self.network.run_from_another_thread(coro) - WaitingDialog(self, 'please wait..', task, self.on_request_sent, self.on_failure) + def on_done(b): + self.main_window.show_message(_('Request scheduled')) + WaitingDialog(self, _('Please wait...'), task, on_done, self.on_failure) def set_frozen(self, chan, *, for_sending, value): if not self.lnworker.uses_trampoline() or self.lnworker.is_trampoline_peer(chan.node_id): diff --git a/electrum/lnchannel.py b/electrum/lnchannel.py index 3827b22e7..ed8074d8b 100644 --- a/electrum/lnchannel.py +++ b/electrum/lnchannel.py @@ -1015,6 +1015,8 @@ class Channel(AbstractChannel): def should_try_to_reestablish_peer(self) -> bool: if self.peer_state != PeerState.DISCONNECTED: return False + if self.should_request_force_close: + return True return ChannelState.PREOPENING < self._state < ChannelState.CLOSING def get_funding_address(self): @@ -1629,6 +1631,8 @@ class Channel(AbstractChannel): if not self.has_unsettled_htlcs(): ret.append(ChanCloseOption.COOP_CLOSE) ret.append(ChanCloseOption.REQUEST_REMOTE_FCLOSE) + if self.get_state() == ChannelState.WE_ARE_TOXIC: + ret.append(ChanCloseOption.REQUEST_REMOTE_FCLOSE) if not self.is_closed() or self.get_state() == ChannelState.REQUESTED_FCLOSE: ret.append(ChanCloseOption.LOCAL_FCLOSE) assert not (self.get_state() == ChannelState.WE_ARE_TOXIC and ChanCloseOption.LOCAL_FCLOSE in ret), "local force-close unsafe if we are toxic" diff --git a/electrum/lnpeer.py b/electrum/lnpeer.py index e3db405f7..d7c3a7202 100644 --- a/electrum/lnpeer.py +++ b/electrum/lnpeer.py @@ -1113,6 +1113,7 @@ class Peer(Logger): async def request_force_close(self, channel_id: bytes): """Try to trigger the remote peer to force-close.""" await self.initialized + self.logger.info(f"trying to get remote peer to force-close chan {channel_id.hex()}") # First, we intentionally send a "channel_reestablish" msg with an old state. # Many nodes (but not all) automatically force-close when seeing this. latest_point = secret_to_pubkey(42) # we need a valid point (BOLT2) @@ -1270,6 +1271,12 @@ class Peer(Logger): async def reestablish_channel(self, chan: Channel): await self.initialized chan_id = chan.channel_id + if chan.should_request_force_close: + if chan.get_state() != ChannelState.WE_ARE_TOXIC: + chan.set_state(ChannelState.REQUESTED_FCLOSE) + await self.request_force_close(chan_id) + chan.should_request_force_close = False + return if chan.get_state() == ChannelState.WE_ARE_TOXIC: # Depending on timing, the remote might not know we are behind. # We should let them know, so that they force-close. @@ -1283,11 +1290,6 @@ class Peer(Logger): # We should let them know: self._send_channel_reestablish(chan) return - if chan.should_request_force_close: - chan.set_state(ChannelState.REQUESTED_FCLOSE) - await self.request_force_close(chan_id) - chan.should_request_force_close = False - return # if we get here, we will try to do a proper reestablish if not (ChannelState.PREOPENING < chan.get_state() < ChannelState.FORCE_CLOSING): raise Exception(f"unexpected {chan.get_state()=} for reestablish") diff --git a/electrum/lnworker.py b/electrum/lnworker.py index 17d9ccd6b..d0b034808 100644 --- a/electrum/lnworker.py +++ b/electrum/lnworker.py @@ -2904,9 +2904,8 @@ class LNWallet(LNWorker): if self._can_retry_addr(peer, urgent=True): await self._add_peer(peer.host, peer.port, peer.pubkey) for chan in self.channels.values(): - if chan.is_closed(): - continue # reestablish + # note: we delegate filtering out uninteresting chans to this: if not chan.should_try_to_reestablish_peer(): continue peer = self._peers.get(chan.node_id, None) @@ -2961,10 +2960,9 @@ class LNWallet(LNWorker): if channel_id in self.channels: chan = self.channels[channel_id] peer = self._peers.get(chan.node_id) - if not peer: - raise Exception('Peer not found') chan.should_request_force_close = True - peer.close_and_cleanup() + if peer: + peer.close_and_cleanup() # to force a reconnect elif connect_str: peer = await self.add_peer(connect_str) await peer.request_force_close(channel_id) From f808ec911af63d01cdbff1813eaa39a1f5bd6107 Mon Sep 17 00:00:00 2001 From: SomberNight Date: Mon, 15 Jan 2024 21:21:29 +0000 Subject: [PATCH 3/4] lnpeer.reest_chan: allow state transition WE_ARE_TOXIC->WE_ARE_TOXIC if the remote does not know we are behind, they will try to send us chan_reest ``` 38.52 | W | P/lnpeer.Peer.[LNWallet, 034cd7a09f-ed50cf95] | channel_reestablish (533x1x1): remote is ahead of us! They should force-close. Remote PCP: 03857eabd943eec820d56b94d2f162763294565627cc42c4a0db0e7c6b77da46be 38.52 | E | P/lnpeer.Peer.[LNWallet, 034cd7a09f-ed50cf95] | Exception in main_loop: Exception('Transition not allowed: WE_ARE_TOXIC -> WE_ARE_TOXIC') Traceback (most recent call last): File "/home/user/wspace/electrum/electrum/util.py", line 1148, in wrapper return await func(*args, **kwargs) File "/home/user/wspace/electrum/electrum/lnpeer.py", line 426, in wrapper_func return await func(self, *args, **kwargs) File "/home/user/wspace/electrum/electrum/lnpeer.py", line 440, in main_loop async with self.taskgroup as group: File "/home/user/.local/lib/python3.10/site-packages/aiorpcx/curio.py", line 297, in __aexit__ await self.join() File "/home/user/wspace/electrum/electrum/util.py", line 1319, in join task.result() File "/home/user/wspace/electrum/electrum/lnpeer.py", line 1231, in on_channel_reestablish chan.set_state(ChannelState.WE_ARE_TOXIC) File "/home/user/wspace/electrum/electrum/lnchannel.py", line 215, in set_state raise Exception(f"Transition not allowed: {old_state.name} -> {state.name}") Exception: Transition not allowed: WE_ARE_TOXIC -> WE_ARE_TOXIC ``` --- electrum/lnchannel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/electrum/lnchannel.py b/electrum/lnchannel.py index ed8074d8b..973e1c0f3 100644 --- a/electrum/lnchannel.py +++ b/electrum/lnchannel.py @@ -140,6 +140,7 @@ state_transitions = [ (cs.OPEN, cs.WE_ARE_TOXIC), (cs.SHUTDOWN, cs.WE_ARE_TOXIC), (cs.REQUESTED_FCLOSE, cs.WE_ARE_TOXIC), + (cs.WE_ARE_TOXIC, cs.WE_ARE_TOXIC), # (cs.FORCE_CLOSING, cs.FORCE_CLOSING), # allow multiple attempts (cs.FORCE_CLOSING, cs.CLOSED), From 2696e357c35c29eecfdd7b1b82f15d55e20bdf58 Mon Sep 17 00:00:00 2001 From: SomberNight Date: Mon, 15 Jan 2024 21:23:53 +0000 Subject: [PATCH 4/4] lnpeer: add comment to on_channel_reestablish re blocking In particular, lnd sends both chan_reest and then an error ("sync error"). It is critical we process the chan_reest and transition to WE_ARE_TOXIC before processing the error (which would trigger us to force-close). see spec https://github.com/lightning/bolts/blame/8a64c6a1cef979b3f0cecb00ba7a48c2d28b3588/02-peer-protocol.md#L1504-L1506 : > - upon reconnection: > [...] > - MUST transmit channel_reestablish for each channel. > - MUST wait to receive the other node's channel_reestablish message before sending any other messages for that channel. --- electrum/lnpeer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/electrum/lnpeer.py b/electrum/lnpeer.py index d7c3a7202..537eb292c 100644 --- a/electrum/lnpeer.py +++ b/electrum/lnpeer.py @@ -1148,6 +1148,10 @@ class Peer(Logger): f"but close option is not allowed. {chan.get_state()=!r}") def on_channel_reestablish(self, chan: Channel, msg): + # Note: it is critical for this message handler to block processing of further messages, + # until this msg is processed. If we are behind (lost state), and send chan_reest to the remote, + # when the remote realizes we are behind, they might send an "error" message - but the spec mandates + # they send chan_reest first. If we processed the error first, we might force-close and lose money! their_next_local_ctn = msg["next_commitment_number"] their_oldest_unrevoked_remote_ctn = msg["next_revocation_number"] their_local_pcp = msg.get("my_current_per_commitment_point")