From 97bd4f210fe516bd4a7788bec1ced203f45de964 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 7 May 2025 14:53:31 +0930 Subject: [PATCH 1/2] connectd: don't run more than one reconnect timer at once. From grubles' logs: ``` 2025-01-06T15:30:31.449Z DEBUG lightningd: attempting connection to 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923 for additional gossip 2025-01-06T15:30:31.449Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Adding 0 addresses to important peer 2025-01-06T15:30:31.449Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:31.449Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds 2025-01-06T15:30:32.037Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:32.037Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds 2025-01-06T15:30:32.428Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:32.428Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds 2025-01-06T15:30:32.680Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:32.681Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds 2025-01-06T15:30:33.468Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:33.469Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds 2025-01-06T15:30:33.471Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:33.471Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds 2025-01-06T15:30:33.935Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:33.935Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds 2025-01-06T15:30:34.125Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:34.125Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds 2025-01-06T15:30:35.496Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:35.497Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds 2025-01-06T15:30:35.623Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:35.623Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds 2025-01-06T15:30:35.751Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:35.751Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds 2025-01-06T15:30:35.892Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Failed connected out: Unable to connect, no address known for peer 2025-01-06T15:30:35.892Z DEBUG 035ca2fe4793a5e789ce846062eb4834f573c060d9200ce77544a29b48a0aa5923-connectd: Will try reconnect in 300 seconds ``` We promised to wait 300 seconds! Signed-off-by: Rusty Russell --- connectd/connectd.c | 12 +++++++----- connectd/connectd.h | 3 +++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/connectd/connectd.c b/connectd/connectd.c index 2cd055a6bcf5..363694f79b1b 100644 --- a/connectd/connectd.c +++ b/connectd/connectd.c @@ -796,6 +796,7 @@ static void reconnect(struct important_id *imp) /* Do gossmap lookup to find any addresses from there, and append. */ append_gossmap_addresses(&addrs, imp->daemon, &imp->id); + imp->reconnect_timer = NULL; try_connect_peer(imp->daemon, &imp->id, take(addrs), false); } @@ -811,7 +812,7 @@ static void schedule_reconnect_if_important(struct daemon *daemon, return; /* Already on it? */ - if (find_connecting(daemon, id)) + if (find_connecting(daemon, id) || imp->reconnect_timer) return; /* --dev-no-reconnect? Don't reconnect. */ @@ -822,10 +823,10 @@ static void schedule_reconnect_if_important(struct daemon *daemon, imp->reconnect_secs); /* We fuzz the timer by up to 1 second, to avoid getting into * simultanous-reconnect deadlocks with peer. */ - notleak(new_reltimer(&daemon->timers, imp, - timerel_add(time_from_sec(imp->reconnect_secs), - time_from_usec(pseudorand(1000000))), - reconnect, imp)); + imp->reconnect_timer = new_reltimer(&daemon->timers, imp, + timerel_add(time_from_sec(imp->reconnect_secs), + time_from_usec(pseudorand(1000000))), + reconnect, imp); /* Back off next time if that fails */ imp->reconnect_secs *= 2; @@ -1972,6 +1973,7 @@ static void connect_to_peer(struct daemon *daemon, const u8 *msg) struct wireaddr_internal, addrs); imp->reconnect_secs = INITIAL_WAIT_SECONDS; + imp->reconnect_timer = NULL; tal_add_destructor(imp, destroy_important_id); important_id_htable_add(daemon->important_ids, imp); } else diff --git a/connectd/connectd.h b/connectd/connectd.h index a62f4475b7bf..2afbdfbfa027 100644 --- a/connectd/connectd.h +++ b/connectd/connectd.h @@ -149,7 +149,10 @@ struct important_id { struct node_id id; struct wireaddr_internal *addrs; + + /* Backoff timer (increases by 2 each time) */ size_t reconnect_secs; + struct oneshot *reconnect_timer; }; static const struct node_id *important_id_keyof(const struct important_id *imp) From 8ec77abb0efb525355dcdec248418a6737183f1f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 7 May 2025 15:02:31 +0930 Subject: [PATCH 2/2] connectd: remove DNS seed lookups. DNS seeds have been down/offline for a while, and this code (which blocks!) has been a source of trouble. We should probably use a canned set of "known nodes" if we want to bootstrap. Signed-off-by: Rusty Russell Changelog-Changed: Protocol: we no longer use DNS seeds for peer lookup fallbacks. Fixes: https://github.com/ElementsProject/lightning/issues/7913 --- connectd/connectd.c | 96 +++--------------------------------- connectd/connectd.h | 4 -- connectd/connectd_wire.csv | 1 - lightningd/connect_control.c | 4 +- lightningd/gossip_control.c | 1 - 5 files changed, 8 insertions(+), 98 deletions(-) diff --git a/connectd/connectd.c b/connectd/connectd.c index 363694f79b1b..69fe522a1e7c 100644 --- a/connectd/connectd.c +++ b/connectd/connectd.c @@ -78,52 +78,12 @@ static void try_connect_one_addr(struct connecting *connect); * timer to call a higher function again, so has to be pre-declared. */ static void try_connect_peer(struct daemon *daemon, const struct node_id *id, - struct wireaddr_internal *addrs TAKES, - bool dns_fallback); + struct wireaddr_internal *addrs TAKES); /* We track peers which are important, and try to reconnect (with backoff) */ static void schedule_reconnect_if_important(struct daemon *daemon, const struct node_id *id); -/*~ Some ISP resolvers will reply with a dummy IP to queries that would otherwise - * result in an NXDOMAIN reply. This just checks whether we have one such - * resolver upstream and remembers its reply so we can try to filter future - * dummies out. - */ -static bool broken_resolver(struct daemon *daemon) -{ - struct addrinfo *addrinfo; - struct addrinfo hints; - const char *hostname = "nxdomain-test.doesntexist"; - int err; - - /* If they told us to never do DNS queries, don't even do this one and - * also not if we just say that we don't */ - if (!daemon->use_dns || daemon->always_use_proxy) { - daemon->broken_resolver_response = NULL; - return false; - } - - memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - hints.ai_protocol = 0; - hints.ai_flags = AI_ADDRCONFIG; - err = getaddrinfo(hostname, tal_fmt(tmpctx, "%d", 42), - &hints, &addrinfo); - - /*~ Note the use of tal_dup here: it is a memdup for tal, but it's - * type-aware so it's less error-prone. */ - if (err == 0) { - daemon->broken_resolver_response - = tal_dup(daemon, struct sockaddr, addrinfo->ai_addr); - freeaddrinfo(addrinfo); - } else - daemon->broken_resolver_response = NULL; - - return daemon->broken_resolver_response != NULL; -} - /*~ Here we see our first tal destructor: in this case the 'struct connect' * simply removes itself from the table of all 'connecting' structs. */ static void destroy_connecting(struct connecting *connect) @@ -797,7 +757,7 @@ static void reconnect(struct important_id *imp) append_gossmap_addresses(&addrs, imp->daemon, &imp->id); imp->reconnect_timer = NULL; - try_connect_peer(imp->daemon, &imp->id, take(addrs), false); + try_connect_peer(imp->daemon, &imp->id, take(addrs)); } static void schedule_reconnect_if_important(struct daemon *daemon, @@ -1656,11 +1616,6 @@ static void connect_init(struct daemon *daemon, const u8 *msg) } else daemon->proxyaddr = NULL; - if (broken_resolver(daemon)) { - status_debug("Broken DNS resolver detected, will check for " - "dummy replies"); - } - /* Figure out our addresses. */ daemon->listen_fds = setup_listeners(daemon, daemon, proposed_wireaddr, @@ -1798,39 +1753,6 @@ static const char **seednames(const tal_t *ctx, const struct node_id *id) return seednames; } -/*~ As a last resort, we do a DNS lookup to the lightning DNS seed to - * resolve a node name when they say to connect to it. This is synchronous, - * so connectd blocks, but it's not very common so we haven't fixed it. - * - * This "seed by DNS" approach is similar to what bitcoind uses, and in fact - * has the nice property that DNS is cached, and the seed only sees a request - * from the ISP, not directly from the user. */ -static void add_seed_addrs(struct wireaddr_internal **addrs, - const struct node_id *id, - struct sockaddr *broken_reply) -{ - struct wireaddr *new_addrs; - const char **hostnames = seednames(tmpctx, id); - - for (size_t i = 0; i < tal_count(hostnames); i++) { - status_peer_debug(id, "Resolving %s", hostnames[i]); - new_addrs = wireaddr_from_hostname(tmpctx, hostnames[i], chainparams_get_ln_port(chainparams), - NULL, broken_reply, NULL); - if (new_addrs) { - for (size_t j = 0; j < tal_count(new_addrs); j++) { - if (new_addrs[j].type == ADDR_TYPE_DNS) - continue; - status_peer_debug(id, "Resolved %s to %s", hostnames[i], - fmt_wireaddr(tmpctx, &new_addrs[j])); - append_addr(addrs, &new_addrs[j]); - } - /* Other seeds will likely have the same information. */ - return; - } else - status_peer_debug(id, "Could not resolve %s", hostnames[i]); - } -} - static bool addr_in(const struct wireaddr_internal *needle, const struct wireaddr_internal haystack[]) { @@ -1844,8 +1766,7 @@ static bool addr_in(const struct wireaddr_internal *needle, /*~ Try to connect to a single peer, given some addresses (in order) */ static void try_connect_peer(struct daemon *daemon, const struct node_id *id, - struct wireaddr_internal *addrs TAKES, - bool dns_fallback) + struct wireaddr_internal *addrs TAKES) { bool use_proxy = daemon->always_use_proxy; struct connecting *connect; @@ -1886,14 +1807,11 @@ static void try_connect_peer(struct daemon *daemon, chainparams_get_ln_port(chainparams)); tal_arr_expand(&addrs, unresolved); } - } else if (daemon->use_dns && dns_fallback) { - add_seed_addrs(&addrs, id, - daemon->broken_resolver_response); } } - /* Still no address? Fail immediately. Lightningd can still choose - * to retry; an address may get gossiped or appear on the DNS seed. */ + /* Still no address? Fail immediately. Important ones get + * retried; an address may get gossiped. */ if (tal_count(addrs) == 0) { connect_failed(daemon, id, CONNECT_NO_KNOWN_ADDRESS, @@ -1937,13 +1855,11 @@ static void connect_to_peer(struct daemon *daemon, const u8 *msg) { struct node_id id; struct wireaddr_internal *addrs; - bool dns_fallback; bool transient; struct important_id *imp; if (!fromwire_connectd_connect_to_peer(tmpctx, msg, &id, &addrs, - &dns_fallback, &transient)) master_badmsg(WIRE_CONNECTD_CONNECT_TO_PEER, msg); @@ -1983,7 +1899,7 @@ static void connect_to_peer(struct daemon *daemon, const u8 *msg) /* Do gossmap lookup to find any addresses from there, and append. */ append_gossmap_addresses(&addrs, daemon, &id); - try_connect_peer(daemon, &id, addrs, dns_fallback); + try_connect_peer(daemon, &id, addrs); } /* lightningd tells us a peer should be disconnected. */ diff --git a/connectd/connectd.h b/connectd/connectd.h index 2afbdfbfa027..d43179d966dd 100644 --- a/connectd/connectd.h +++ b/connectd/connectd.h @@ -296,10 +296,6 @@ struct daemon { * resort, but doing so leaks our address so can be disabled. */ bool use_dns; - /* The address that the broken response returns instead of - * NXDOMAIN. NULL if we have not detected a broken resolver. */ - struct sockaddr *broken_resolver_response; - /* File descriptors to listen on once we're activated. */ const struct listen_fd **listen_fds; diff --git a/connectd/connectd_wire.csv b/connectd/connectd_wire.csv index 6e7d986d34d0..96a88fa4a167 100644 --- a/connectd/connectd_wire.csv +++ b/connectd/connectd_wire.csv @@ -65,7 +65,6 @@ msgtype,connectd_connect_to_peer,2001 msgdata,connectd_connect_to_peer,id,node_id, msgdata,connectd_connect_to_peer,len,u32, msgdata,connectd_connect_to_peer,addrs,wireaddr_internal,len -msgdata,connectd_connect_to_peer,dns_fallback,bool, msgdata,connectd_connect_to_peer,transient,bool, # Connectd->master: connect failed. diff --git a/lightningd/connect_control.c b/lightningd/connect_control.c index 461f32ccdc8b..ceb41d3027d0 100644 --- a/lightningd/connect_control.c +++ b/lightningd/connect_control.c @@ -237,7 +237,7 @@ static struct command_result *json_connect(struct command *cmd, } subd_send_msg(cmd->ld->connectd, - take(towire_connectd_connect_to_peer(NULL, &id_addr.id, addr, true, true))); + take(towire_connectd_connect_to_peer(NULL, &id_addr.id, addr, true))); /* Leave this here for peer_connected, connect_failed or peer_disconnect_done. */ new_connect(cmd->ld, &id_addr.id, cmd); @@ -436,7 +436,7 @@ void connectd_connect_to_peer(struct lightningd *ld, } subd_send_msg(peer->ld->connectd, take(towire_connectd_connect_to_peer(NULL, &peer->id, - waddr, true, + waddr, !is_important))); } diff --git a/lightningd/gossip_control.c b/lightningd/gossip_control.c index e22bd1083f1e..2009d732ef8a 100644 --- a/lightningd/gossip_control.c +++ b/lightningd/gossip_control.c @@ -188,7 +188,6 @@ static void handle_connect_to_peer(struct subd *gossip, const u8 *msg) connectmsg = towire_connectd_connect_to_peer(NULL, &id, NULL, //addrhint, - false, //dns_fallback true); //transient subd_send_msg(gossip->ld->connectd, take(connectmsg)); }