[PATCHv3 net-next 25/31] iw_cxgb4: endpoint timeout fixes.

From: Steve Wise <swise@xxxxxxxxxxxxxxxxxxxxx>

1) Timed-out endpoint processing can be starved.  If there is a
continual flow of CPL messages into the driver, the endpoint timeout
processing never gets a chance to run.  This condition exposed the
other bugs below.

Solution: In process_work(), call process_timedout_eps() after each CPL
is processed.
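
To illustrate the ordering change, here is a minimal standalone C
sketch, not the driver code itself: dequeue_cpl(), handle_cpl() and
queued_timeouts are simplified stand-ins for skb_dequeue(&rxq), the
work_handlers[] dispatch, and the timeout list.

#include <stdio.h>

static int queued_timeouts;             /* endpoints on the timeout list */

static void process_timedout_eps(void)
{
	while (queued_timeouts) {
		printf("aborting one timed-out endpoint\n");
		queued_timeouts--;
	}
}

static int dequeue_cpl(void)            /* stand-in for skb_dequeue(&rxq) */
{
	static int remaining = 3;
	return remaining-- > 0;
}

static void handle_cpl(void)            /* stand-in for work_handlers[opcode]() */
{
	queued_timeouts++;              /* pretend a timer fired meanwhile */
}

static void process_work(void)
{
	process_timedout_eps();         /* drain anything already queued */
	while (dequeue_cpl()) {
		handle_cpl();
		process_timedout_eps(); /* was: run once, after the loop */
	}
}

int main(void)
{
	process_work();
	return 0;
}

With the pass inside the loop, timeout handling runs once per message
instead of only after the receive queue drains.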

2) Connection events can be processed even though the endpoint is on
the timeout list.  If the endpoint is scheduled for timeout processing,
then we must ignore MPA Start Requests and Replies.

Solution: Change stop_ep_timer() to return 1 if the ep has already been
queued for timeout processing.  All the callers of stop_ep_timer() need
to check this and act accordingly.  There are just a few cases where
the caller needs to do something different if stop_ep_timer() returns 1:

a) in process_mpa_reply(), ignore the reply; process_timeout() will
abort the connection.

b) in process_mpa_request(), ignore the request; process_timeout() will
abort the connection.

It is OK for callers of stop_ep_timer() to abort the connection, since
that leaves the state in ABORTING or DEAD, and process_timeout() now
ignores timeouts when the ep is in those states.
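
A minimal standalone sketch of the new stop_ep_timer() contract and the
caller pattern follows; test_and_set_timeout() is a simplified stand-in
for test_and_set_bit(TIMEOUT, &ep->com.flags), and the struct and
main() are purely illustrative.

#include <stdbool.h>
#include <stdio.h>

struct ep {
	bool timeout_flag;  /* stands in for the TIMEOUT bit in ep->com.flags */
};

/* stands in for test_and_set_bit(TIMEOUT, &ep->com.flags) */
static bool test_and_set_timeout(struct ep *ep)
{
	bool old = ep->timeout_flag;
	ep->timeout_flag = true;
	return old;
}

static int stop_ep_timer(struct ep *ep)
{
	/* del_timer_sync(&ep->timer) would go here in the driver */
	if (!test_and_set_timeout(ep)) {
		/* we won the race: drop the timer's ref, report "stopped" */
		return 0;
	}
	/* timer already fired; process_timeout() owns this endpoint now */
	return 1;
}

static void process_mpa_reply(struct ep *ep)
{
	if (stop_ep_timer(ep))
		return;     /* ignore the reply; process_timeout() will abort */
	printf("processing MPA reply\n");
}

int main(void)
{
	struct ep a = { .timeout_flag = false };  /* timer still pending */
	struct ep b = { .timeout_flag = true };   /* timer already fired */

	process_mpa_reply(&a);  /* prints: processing MPA reply */
	process_mpa_reply(&b);  /* silently ignored */
	return 0;
}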

3) Double insertion on the timeout list.  Since the endpoint timers are
used for both connection setup and teardown, we need to guard against
the possibility that an endpoint is already on the timeout list.  This
is a rare condition, seen only under heavy load and in the presence of
the two bugs above.

Solution: In ep_timeout(), don't queue the endpoint if it is already on
the queue.
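
A minimal standalone sketch of the guard: the hand-rolled list_head
helpers below stand in for the kernel's <linux/list.h>.  The key detail
is that process_timedout_eps() NULLs entry.next/prev after list_del(),
which is what makes the "already queued" test in ep_timeout() valid.

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

struct ep { struct list_head entry; };

static struct list_head timeout_list = { &timeout_list, &timeout_list };

static void ep_timeout(struct ep *ep)
{
	/* Only insert if it is not already on the list. */
	if (!ep->entry.next) {
		list_add_tail(&ep->entry, &timeout_list);
		printf("queued\n");
	} else {
		printf("already queued, skipping\n");
	}
}

static void process_timedout_eps(void)
{
	while (timeout_list.next != &timeout_list) {
		struct list_head *tmp = timeout_list.next;

		list_del(tmp);
		tmp->next = NULL;   /* make "on the list?" testable again */
		tmp->prev = NULL;
		/* process_timeout(container_of(tmp, struct ep, entry)); */
	}
}

int main(void)
{
	struct ep ep = { .entry = { NULL, NULL } };  /* never queued yet */

	ep_timeout(&ep);          /* queued */
	ep_timeout(&ep);          /* already queued, skipping */
	process_timedout_eps();
	ep_timeout(&ep);          /* queued again after processing */
	return 0;
}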

Signed-off-by: Steve Wise <swise@xxxxxxxxxxxxxxxxxxxxx>
---
 drivers/infiniband/hw/cxgb4/cm.c | 89 +++++++++++++++++++++++++---------------
 1 file changed, 56 insertions(+), 33 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 31e725e..64126bb 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -178,12 +178,15 @@ static void start_ep_timer(struct c4iw_ep *ep)
 	add_timer(&ep->timer);
 }
 
-static void stop_ep_timer(struct c4iw_ep *ep)
+static int stop_ep_timer(struct c4iw_ep *ep)
 {
 	PDBG("%s ep %p stopping\n", __func__, ep);
 	del_timer_sync(&ep->timer);
-	if (!test_and_set_bit(TIMEOUT, &ep->com.flags))
+	if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
 		c4iw_put_ep(&ep->com);
+		return 0;
+	}
+	return 1;
 }
 
 static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb,
@@ -1188,12 +1191,11 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
 	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
 
 	/*
-	 * Stop mpa timer.  If it expired, then the state has
-	 * changed and we bail since ep_timeout already aborted
-	 * the connection.
+	 * Stop mpa timer.  If it expired, then
+	 * we ignore the MPA reply.  process_timeout()
+	 * will abort the connection.
 	 */
-	stop_ep_timer(ep);
-	if (ep->com.state != MPA_REQ_SENT)
+	if (stop_ep_timer(ep))
 		return;
 
 	/*
@@ -1398,15 +1400,12 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 
 	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
 
-	if (ep->com.state != MPA_REQ_WAIT)
-		return;
-
 	/*
 	 * If we get more than the supported amount of private data
 	 * then we must fail this connection.
 	 */
 	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
-		stop_ep_timer(ep);
+		(void)stop_ep_timer(ep);
 		abort_connection(ep, skb, GFP_KERNEL);
 		return;
 	}
@@ -1436,13 +1435,13 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 	if (mpa->revision > mpa_rev) {
 		printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
 		       " Received = %d\n", __func__, mpa_rev, mpa->revision);
-		stop_ep_timer(ep);
+		(void)stop_ep_timer(ep);
 		abort_connection(ep, skb, GFP_KERNEL);
 		return;
 	}
 
 	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
-		stop_ep_timer(ep);
+		(void)stop_ep_timer(ep);
 		abort_connection(ep, skb, GFP_KERNEL);
 		return;
 	}
@@ -1453,7 +1452,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 	 * Fail if there's too much private data.
 	 */
 	if (plen > MPA_MAX_PRIVATE_DATA) {
-		stop_ep_timer(ep);
+		(void)stop_ep_timer(ep);
 		abort_connection(ep, skb, GFP_KERNEL);
 		return;
 	}
@@ -1462,7 +1461,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 	 * If plen does not account for pkt size
 	 */
 	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
-		stop_ep_timer(ep);
+		(void)stop_ep_timer(ep);
 		abort_connection(ep, skb, GFP_KERNEL);
 		return;
 	}
@@ -1519,18 +1518,24 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
 	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
 	     ep->mpa_attr.p2p_type);
 
-	__state_set(&ep->com, MPA_REQ_RCVD);
-	stop_ep_timer(ep);
-
-	/* drive upcall */
-	mutex_lock(&ep->parent_ep->com.mutex);
-	if (ep->parent_ep->com.state != DEAD) {
-		if (connect_request_upcall(ep))
+	/*
+	 * If the endpoint timer already expired, then we ignore
+	 * the start request.  process_timeout() will abort
+	 * the connection.
+	 */
+	if (!stop_ep_timer(ep)) {
+		__state_set(&ep->com, MPA_REQ_RCVD);
+
+		/* drive upcall */
+		mutex_lock(&ep->parent_ep->com.mutex);
+		if (ep->parent_ep->com.state != DEAD) {
+			if (connect_request_upcall(ep))
+				abort_connection(ep, skb, GFP_KERNEL);
+		} else {
 			abort_connection(ep, skb, GFP_KERNEL);
-	} else {
-		abort_connection(ep, skb, GFP_KERNEL);
+		}
+		mutex_unlock(&ep->parent_ep->com.mutex);
 	}
-	mutex_unlock(&ep->parent_ep->com.mutex);
 	return;
 }
 
@@ -2320,7 +2325,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
 		disconnect = 0;
 		break;
 	case MORIBUND:
-		stop_ep_timer(ep);
+		(void)stop_ep_timer(ep);
 		if (ep->com.cm_id && ep->com.qp) {
 			attrs.next_state = C4IW_QP_STATE_IDLE;
 			c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
@@ -2380,10 +2385,10 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
 	case CONNECTING:
 		break;
 	case MPA_REQ_WAIT:
-		stop_ep_timer(ep);
+		(void)stop_ep_timer(ep);
 		break;
 	case MPA_REQ_SENT:
-		stop_ep_timer(ep);
+		(void)stop_ep_timer(ep);
 		if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1))
 			connect_reply_upcall(ep, -ECONNRESET);
 		else {
@@ -2488,7 +2493,7 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 		__state_set(&ep->com, MORIBUND);
 		break;
 	case MORIBUND:
-		stop_ep_timer(ep);
+		(void)stop_ep_timer(ep);
 		if ((ep->com.cm_id) && (ep->com.qp)) {
 			attrs.next_state = C4IW_QP_STATE_IDLE;
 			c4iw_modify_qp(ep->com.qp->rhp,
@@ -3083,7 +3088,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
 		if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) {
 			close = 1;
 			if (abrupt) {
-				stop_ep_timer(ep);
+				(void)stop_ep_timer(ep);
 				ep->com.state = ABORTING;
 			} else
 				ep->com.state = MORIBUND;
@@ -3518,6 +3523,16 @@ static void process_timeout(struct c4iw_ep *ep)
 		__state_set(&ep->com, ABORTING);
 		close_complete_upcall(ep, -ETIMEDOUT);
 		break;
+	case ABORTING:
+	case DEAD:
+
+		/*
+		 * These states are expected if the ep timed out at the same
+		 * time as another thread was calling stop_ep_timer().
+		 * So we silently do nothing for these states.
+		 */
+		abort = 0;
+		break;
 	default:
 		WARN(1, "%s unexpected state ep %p tid %u state %u\n",
 			__func__, ep, ep->hwtid, ep->com.state);
@@ -3539,6 +3554,8 @@ static void process_timedout_eps(void)
 
 		tmp = timeout_list.next;
 		list_del(tmp);
+		tmp->next = NULL;
+		tmp->prev = NULL;
 		spin_unlock_irq(&timeout_lock);
 		ep = list_entry(tmp, struct c4iw_ep, entry);
 		process_timeout(ep);
@@ -3555,6 +3572,7 @@ static void process_work(struct work_struct *work)
 	unsigned int opcode;
 	int ret;
 
+	process_timedout_eps();
 	while ((skb = skb_dequeue(&rxq))) {
 		rpl = cplhdr(skb);
 		dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *)));
@@ -3564,8 +3582,8 @@ static void process_work(struct work_struct *work)
 		ret = work_handlers[opcode](dev, skb);
 		if (!ret)
 			kfree_skb(skb);
+		process_timedout_eps();
 	}
-	process_timedout_eps();
 }
 
 static DECLARE_WORK(skb_work, process_work);
@@ -3577,8 +3595,13 @@ static void ep_timeout(unsigned long arg)
 
 	spin_lock(&timeout_lock);
 	if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
-		list_add_tail(&ep->entry, &timeout_list);
-		kickit = 1;
+		/*
+		 * Only insert if it is not already on the list.
+		 */
+		if (!ep->entry.next) {
+			list_add_tail(&ep->entry, &timeout_list);
+			kickit = 1;
+		}
 	}
 	spin_unlock(&timeout_lock);
 	if (kickit)
-- 
1.8.4
