Merge pull request #1104 from ecsv/batadv-for-24.10

openwrt-24.10: batman-adv: Merge bugfixes from 2025.0
This commit is contained in:
Simon Wunderlich
2025-02-08 13:25:27 +01:00
committed by GitHub
4 changed files with 438 additions and 1 deletions

View File

@@ -4,7 +4,7 @@ include $(TOPDIR)/rules.mk
PKG_NAME:=batman-adv
PKG_VERSION:=2024.3
PKG_RELEASE:=3
PKG_RELEASE:=4
PKG_SOURCE:=$(PKG_NAME)-$(PKG_VERSION).tar.gz
PKG_SOURCE_URL:=https://downloads.open-mesh.org/batman/releases/batman-adv-$(PKG_VERSION)

View File

@@ -0,0 +1,71 @@
From: Andy Strohman <andrew@andrewstrohman.com>
Date: Thu, 9 Jan 2025 02:27:56 +0000
Subject: batman-adv: fix panic during interface removal
Reference counting is used to ensure that
batadv_hardif_neigh_node and batadv_hard_iface
are not freed before/during
batadv_v_elp_throughput_metric_update work is
finished.
But there isn't a guarantee that the hard if will
remain associated with a soft interface up until
the work is finished.
This fixes a crash triggered by reboot that looks
like this:
Call trace:
batadv_v_mesh_free+0xd0/0x4dc [batman_adv]
batadv_v_elp_throughput_metric_update+0x1c/0xa4
process_one_work+0x178/0x398
worker_thread+0x2e8/0x4d0
kthread+0xd8/0xdc
ret_from_fork+0x10/0x20
(the batadv_v_mesh_free call is misleading,
and does not actually happen)
I was able to make the issue happen more reliably
by changing hardif_neigh->bat_v.metric_work work
to be delayed work. This allowed me to track down
and confirm the fix.
Fixes: 5c3245172c01 ("batman-adv: ELP - compute the metric based on the estimated throughput")
Signed-off-by: Andy Strohman <andrew@andrewstrohman.com>
[sven@narfation.org: prevent entering batadv_v_elp_get_throughput without
soft_iface]
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Origin: upstream, https://git.open-mesh.org/batman-adv.git/commit/1d7c8ac629678b8cd23ef9def7c7b111cb9e8ed5
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -66,12 +66,19 @@ static void batadv_v_elp_start_timer(str
static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
{
struct batadv_hard_iface *hard_iface = neigh->if_incoming;
+ struct net_device *soft_iface = hard_iface->soft_iface;
struct ethtool_link_ksettings link_settings;
struct net_device *real_netdev;
struct station_info sinfo;
u32 throughput;
int ret;
+ /* don't query throughput when no longer associated with any
+ * batman-adv interface
+ */
+ if (!soft_iface)
+ return BATADV_THROUGHPUT_DEFAULT_VALUE;
+
/* if the user specified a customised value for this interface, then
* return it directly
*/
@@ -141,7 +148,7 @@ static u32 batadv_v_elp_get_throughput(s
default_throughput:
if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) {
- batadv_info(hard_iface->soft_iface,
+ batadv_info(soft_iface,
"WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n",
hard_iface->net_dev->name,
BATADV_THROUGHPUT_DEFAULT_VALUE / 10,

View File

@@ -0,0 +1,127 @@
From: Sven Eckelmann <sven@narfation.org>
Date: Wed, 22 Jan 2025 21:51:20 +0100
Subject: batman-adv: Ignore neighbor throughput metrics in error case
If a temporary error happened in the evaluation of the neighbor throughput
information, then the invalid throughput result should not be stored in the
throughtput EWMA.
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Origin: upstream, https://git.open-mesh.org/batman-adv.git/commit/ce8b40b7bdcc7f011191acda4c2d3067566eef54
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -59,11 +59,13 @@ static void batadv_v_elp_start_timer(str
/**
* batadv_v_elp_get_throughput() - get the throughput towards a neighbour
* @neigh: the neighbour for which the throughput has to be obtained
+ * @pthroughput: calculated throughput towards the given neighbour in multiples
+ * of 100kpbs (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc).
*
- * Return: The throughput towards the given neighbour in multiples of 100kpbs
- * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc).
+ * Return: true when value behind @pthroughput was set
*/
-static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
+static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh,
+ u32 *pthroughput)
{
struct batadv_hard_iface *hard_iface = neigh->if_incoming;
struct net_device *soft_iface = hard_iface->soft_iface;
@@ -77,14 +79,16 @@ static u32 batadv_v_elp_get_throughput(s
* batman-adv interface
*/
if (!soft_iface)
- return BATADV_THROUGHPUT_DEFAULT_VALUE;
+ return false;
/* if the user specified a customised value for this interface, then
* return it directly
*/
throughput = atomic_read(&hard_iface->bat_v.throughput_override);
- if (throughput != 0)
- return throughput;
+ if (throughput != 0) {
+ *pthroughput = throughput;
+ return true;
+ }
/* if this is a wireless device, then ask its throughput through
* cfg80211 API
@@ -111,19 +115,24 @@ static u32 batadv_v_elp_get_throughput(s
* possible to delete this neighbor. For now set
* the throughput metric to 0.
*/
- return 0;
+ *pthroughput = 0;
+ return true;
}
if (ret)
goto default_throughput;
- if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT))
- return sinfo.expected_throughput / 100;
+ if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) {
+ *pthroughput = sinfo.expected_throughput / 100;
+ return true;
+ }
/* try to estimate the expected throughput based on reported tx
* rates
*/
- if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))
- return cfg80211_calculate_bitrate(&sinfo.txrate) / 3;
+ if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) {
+ *pthroughput = cfg80211_calculate_bitrate(&sinfo.txrate) / 3;
+ return true;
+ }
goto default_throughput;
}
@@ -142,8 +151,10 @@ static u32 batadv_v_elp_get_throughput(s
hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
throughput = link_settings.base.speed;
- if (throughput && throughput != SPEED_UNKNOWN)
- return throughput * 10;
+ if (throughput && throughput != SPEED_UNKNOWN) {
+ *pthroughput = throughput * 10;
+ return true;
+ }
}
default_throughput:
@@ -157,7 +168,8 @@ default_throughput:
}
/* if none of the above cases apply, return the base_throughput */
- return BATADV_THROUGHPUT_DEFAULT_VALUE;
+ *pthroughput = BATADV_THROUGHPUT_DEFAULT_VALUE;
+ return true;
}
/**
@@ -169,15 +181,21 @@ void batadv_v_elp_throughput_metric_upda
{
struct batadv_hardif_neigh_node_bat_v *neigh_bat_v;
struct batadv_hardif_neigh_node *neigh;
+ u32 throughput;
+ bool valid;
neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v,
metric_work);
neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node,
bat_v);
- ewma_throughput_add(&neigh->bat_v.throughput,
- batadv_v_elp_get_throughput(neigh));
+ valid = batadv_v_elp_get_throughput(neigh, &throughput);
+ if (!valid)
+ goto put_neigh;
+
+ ewma_throughput_add(&neigh->bat_v.throughput, throughput);
+put_neigh:
/* decrement refcounter to balance increment performed before scheduling
* this task
*/

View File

@@ -0,0 +1,239 @@
From: Sven Eckelmann <sven@narfation.org>
Date: Wed, 22 Jan 2025 21:51:21 +0100
Subject: batman-adv: Drop unmanaged ELP metric worker
The ELP worker needs to calculate new metric values for all neighbors
"reachable" over an interface. Some of the used metric sources require
locks which might need to sleep. This sleep is incompatible with the RCU
list iterator used for the recorded neighbors. The initial approach to work
around of this problem was to queue another work item per neighbor and then
run this in a new context.
Even when this solved the RCU vs might_sleep() conflict, it has a major
problems: Nothing was stopping the work item in case it is not needed
anymore - for example because one of the related interfaces was removed or
the batman-adv module was unloaded - resulting in potential invalid memory
accesses.
Directly canceling the metric worker also has various problems:
* cancel_work_sync for a to-be-deactivated interface is called with
rtnl_lock held. But the code in the ELP metric worker also tries to use
rtnl_lock() - which will never return in this case. This also means that
cancel_work_sync would never return because it is waiting for the worker
to finish.
* iterating over the neighbor list for the to-be-deactivated interface is
currently done using the RCU specific methods. Which means that it is
possible to miss items when iterating over it without the associated
spinlock - a behaviour which is acceptable for a periodic metric check
but not for a cleanup routine (which must "stop" all still running
workers)
The better approch is to get rid of the per interface neighbor metric
worker and handle everything in the interface worker. The original problems
are solved by:
* creating a list of neighbors which require new metric information inside
the RCU protected context, gathering the metric according to the new list
outside the RCU protected context
* only use rcu_trylock inside metric gathering code to avoid a deadlock
when the cancel_delayed_work_sync is called in the interface removal code
(which is called with the rtnl_lock held)
Fixes: 5c3245172c01 ("batman-adv: ELP - compute the metric based on the estimated throughput")
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Origin: upstream, https://git.open-mesh.org/batman-adv.git/commit/405d49a20a205799ec453cadb3d078dc2808d563
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -113,8 +113,6 @@ static void
batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
{
ewma_throughput_init(&hardif_neigh->bat_v.throughput);
- INIT_WORK(&hardif_neigh->bat_v.metric_work,
- batadv_v_elp_throughput_metric_update);
}
/**
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -18,6 +18,7 @@
#include <linux/if_ether.h>
#include <linux/jiffies.h>
#include <linux/kref.h>
+#include <linux/list.h>
#include <linux/minmax.h>
#include <linux/netdevice.h>
#include <linux/nl80211.h>
@@ -26,6 +27,7 @@
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -42,6 +44,18 @@
#include "send.h"
/**
+ * struct batadv_v_metric_queue_entry - list of hardif neighbors which require
+ * and metric update
+ */
+struct batadv_v_metric_queue_entry {
+ /** @hardif_neigh: hardif neighbor scheduled for metric update */
+ struct batadv_hardif_neigh_node *hardif_neigh;
+
+ /** @list: list node for metric_queue */
+ struct list_head list;
+};
+
+/**
* batadv_v_elp_start_timer() - restart timer for ELP periodic work
* @hard_iface: the interface for which the timer has to be reset
*/
@@ -137,10 +151,17 @@ static bool batadv_v_elp_get_throughput(
goto default_throughput;
}
+ /* only use rtnl_trylock because the elp worker will be cancelled while
+ * the rntl_lock is held. the cancel_delayed_work_sync() would otherwise
+ * wait forever when the elp work_item was started and it is then also
+ * trying to rtnl_lock
+ */
+ if (!rtnl_trylock())
+ return false;
+
/* if not a wifi interface, check if this device provides data via
* ethtool (e.g. an Ethernet adapter)
*/
- rtnl_lock();
ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings);
rtnl_unlock();
if (ret == 0) {
@@ -175,31 +196,19 @@ default_throughput:
/**
* batadv_v_elp_throughput_metric_update() - worker updating the throughput
* metric of a single hop neighbour
- * @work: the work queue item
+ * @neigh: the neighbour to probe
*/
-void batadv_v_elp_throughput_metric_update(struct work_struct *work)
+static void
+batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh)
{
- struct batadv_hardif_neigh_node_bat_v *neigh_bat_v;
- struct batadv_hardif_neigh_node *neigh;
u32 throughput;
bool valid;
- neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v,
- metric_work);
- neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node,
- bat_v);
-
valid = batadv_v_elp_get_throughput(neigh, &throughput);
if (!valid)
- goto put_neigh;
+ return;
ewma_throughput_add(&neigh->bat_v.throughput, throughput);
-
-put_neigh:
- /* decrement refcounter to balance increment performed before scheduling
- * this task
- */
- batadv_hardif_neigh_put(neigh);
}
/**
@@ -273,14 +282,16 @@ batadv_v_elp_wifi_neigh_probe(struct bat
*/
static void batadv_v_elp_periodic_work(struct work_struct *work)
{
+ struct batadv_v_metric_queue_entry *metric_entry;
+ struct batadv_v_metric_queue_entry *metric_safe;
struct batadv_hardif_neigh_node *hardif_neigh;
struct batadv_hard_iface *hard_iface;
struct batadv_hard_iface_bat_v *bat_v;
struct batadv_elp_packet *elp_packet;
+ struct list_head metric_queue;
struct batadv_priv *bat_priv;
struct sk_buff *skb;
u32 elp_interval;
- bool ret;
bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work);
hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v);
@@ -316,6 +327,8 @@ static void batadv_v_elp_periodic_work(s
atomic_inc(&hard_iface->bat_v.elp_seqno);
+ INIT_LIST_HEAD(&metric_queue);
+
/* The throughput metric is updated on each sent packet. This way, if a
* node is dead and no longer sends packets, batman-adv is still able to
* react timely to its death.
@@ -340,16 +353,28 @@ static void batadv_v_elp_periodic_work(s
/* Reading the estimated throughput from cfg80211 is a task that
* may sleep and that is not allowed in an rcu protected
- * context. Therefore schedule a task for that.
+ * context. Therefore add it to metric_queue and process it
+ * outside rcu protected context.
*/
- ret = queue_work(batadv_event_workqueue,
- &hardif_neigh->bat_v.metric_work);
-
- if (!ret)
+ metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC);
+ if (!metric_entry) {
batadv_hardif_neigh_put(hardif_neigh);
+ continue;
+ }
+
+ metric_entry->hardif_neigh = hardif_neigh;
+ list_add(&metric_entry->list, &metric_queue);
}
rcu_read_unlock();
+ list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) {
+ batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh);
+
+ batadv_hardif_neigh_put(metric_entry->hardif_neigh);
+ list_del(&metric_entry->list);
+ kfree(metric_entry);
+ }
+
restart_timer:
batadv_v_elp_start_timer(hard_iface);
out:
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -10,7 +10,6 @@
#include "main.h"
#include <linux/skbuff.h>
-#include <linux/workqueue.h>
int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
@@ -19,6 +18,5 @@ void batadv_v_elp_iface_activate(struct
void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
int batadv_v_elp_packet_recv(struct sk_buff *skb,
struct batadv_hard_iface *if_incoming);
-void batadv_v_elp_throughput_metric_update(struct work_struct *work);
#endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -596,9 +596,6 @@ struct batadv_hardif_neigh_node_bat_v {
* neighbor
*/
unsigned long last_unicast_tx;
-
- /** @metric_work: work queue callback item for metric update */
- struct work_struct metric_work;
};
/**