int hfi1_make_rc_req()

in hw/hfi1/rc.c [388:1178]


int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
{
	struct hfi1_qp_priv *priv = qp->priv;
	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
	struct ib_other_headers *ohdr;
	struct rvt_sge_state *ss = NULL;
	struct rvt_swqe *wqe;
	struct hfi1_swqe_priv *wpriv;
	struct tid_rdma_request *req = NULL;
	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	u32 hwords = 5;
	u32 len = 0;
	u32 bth0 = 0, bth2 = 0;
	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
	u32 pmtu = qp->pmtu;
	char newreq;
	int middle = 0;
	int delta;
	struct tid_rdma_flow *flow = NULL;
	struct tid_rdma_params *remote;

	trace_hfi1_sender_make_rc_req(qp);
	lockdep_assert_held(&qp->s_lock);
	ps->s_txreq = get_txreq(ps->dev, qp);
	if (!ps->s_txreq)
		goto bail_no_tx;

	if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
		hwords = 5;
		if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
			ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth;
		else
			ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
	} else {
		/* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
		hwords = 7;
		if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
		    (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))))
			ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth;
		else
			ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth;
	}

	/* Sending responses takes priority over sending requests. */
	if ((qp->s_flags & RVT_S_RESP_PENDING) &&
	    make_rc_ack(dev, qp, ohdr, ps))
		return 1;

	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
		if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
			goto bail;
		/* We are in the error state, flush the work request. */
		if (qp->s_last == READ_ONCE(qp->s_head))
			goto bail;
		/* If DMAs are in progress, we can't flush immediately. */
		if (iowait_sdma_pending(&priv->s_iowait)) {
			qp->s_flags |= RVT_S_WAIT_DMA;
			goto bail;
		}
		clear_ahg(qp);
		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
		hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
					 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
		/* will get called again */
		goto done_free_tx;
	}

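	/*
	 * Nothing can be sent while waiting for an RNR timeout, an
	 * outstanding ACK, or a TID RDMA halt.
	 */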
	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
		goto bail;

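	/*
	 * A retry may have backed qp->s_psn into the range of PSNs that
	 * are still being transmitted. If any of those sends are still
	 * pending, wait for them to complete before rebuilding packets
	 * in that range.
	 */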
	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
		if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
			qp->s_flags |= RVT_S_WAIT_PSN;
			goto bail;
		}
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
	}

	/* Send a request. */
	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
check_s_state:
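	/* Re-entered from TID_OP(READ_REQ) below after s_state is reset. */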
	switch (qp->s_state) {
	default:
		if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
			goto bail;
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			if (qp->s_tail == READ_ONCE(qp->s_head)) {
				clear_ahg(qp);
				goto bail;
			}
			/*
			 * If a fence is requested, wait for previous
			 * RDMA read and atomic operations to finish.
			 * However, there is no need to guard against
			 * TID RDMA READ after TID RDMA READ.
			 */
			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
			    qp->s_num_rd_atomic &&
			    (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
			     priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
				qp->s_flags |= RVT_S_WAIT_FENCE;
				goto bail;
			}
			/*
			 * Local operations are processed immediately
			 * after all prior requests have completed.
			 */
			if (wqe->wr.opcode == IB_WR_REG_MR ||
			    wqe->wr.opcode == IB_WR_LOCAL_INV) {
				int local_ops = 0;
				int err = 0;

				if (qp->s_last != qp->s_cur)
					goto bail;
				if (++qp->s_cur == qp->s_size)
					qp->s_cur = 0;
				if (++qp->s_tail == qp->s_size)
					qp->s_tail = 0;
				if (!(wqe->wr.send_flags &
				      RVT_SEND_COMPLETION_ONLY)) {
					err = rvt_invalidate_rkey(
						qp,
						wqe->wr.ex.invalidate_rkey);
					local_ops = 1;
				}
				rvt_send_complete(qp, wqe,
						  err ? IB_WC_LOC_PROT_ERR
						      : IB_WC_SUCCESS);
				if (local_ops)
					atomic_dec(&qp->local_ops_pending);
				goto done_free_tx;
			}

			newreq = 1;
			qp->s_psn = wqe->psn;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		len = wqe->length;
		ss = &qp->s_sge;
		bth2 = mask_psn(qp->s_psn);

		/*
		 * Interlock between various IB requests and TID RDMA
		 * if necessary.
		 */
		if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
		    hfi1_tid_rdma_wqe_interlock(qp, wqe))
			goto bail;

		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
		case IB_WR_SEND_WITH_INV:
			/* If no credit, return. */
			if (!rvt_rc_credit_avail(qp, wqe))
				goto bail;
			if (len > pmtu) {
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND) {
				qp->s_state = OP(SEND_ONLY);
			} else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
			} else {
				qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
				/* Invalidate rkey comes after the BTH */
				ohdr->u.ieth = cpu_to_be32(
						wqe->wr.ex.invalidate_rkey);
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_WRITE:
			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			goto no_flow_control;
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (!rvt_rc_credit_avail(qp, wqe))
				goto bail;
no_flow_control:
			put_ib_reth_vaddr(
				wqe->rdma_wr.remote_addr,
				&ohdr->u.rc.reth);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / sizeof(u32);
			if (len > pmtu) {
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
				qp->s_state = OP(RDMA_WRITE_ONLY);
			} else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after RETH */
				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= IB_BTH_SOLICITED;
			}
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_TID_RDMA_WRITE:
			if (newreq) {
				/*
				 * Limit the number of TID RDMA WRITE requests.
				 */
				if (atomic_read(&priv->n_tid_requests) >=
				    HFI1_TID_RDMA_WRITE_CNT)
					goto bail;

				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
			}

			hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
								&bth1, &bth2,
								&len);
			ss = NULL;
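			/*
			 * The TID RDMA WRITE request itself carries no
			 * payload; data segments are sent only after the
			 * responder grants TID resources in its WRITE_RESP.
			 */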
			if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
				priv->s_tid_cur = qp->s_cur;
				if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
					priv->s_tid_tail = qp->s_cur;
					priv->s_state = TID_OP(WRITE_RESP);
				}
			} else if (priv->s_tid_cur == priv->s_tid_head) {
				struct rvt_swqe *__w;
				struct tid_rdma_request *__r;

				__w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
				__r = wqe_to_tid_req(__w);

				/*
				 * The s_tid_cur pointer is advanced to s_cur if
				 * any of the following conditions about the WQE
				 * to which s_tid_cur currently points are
				 * satisfied:
				 *   1. The request is not a TID RDMA WRITE
				 *      request,
				 *   2. The request is in the INACTIVE or
				 *      COMPLETE states (TID RDMA READ requests
				 *      stay at INACTIVE and TID RDMA WRITE
				 *      requests transition to COMPLETE when
				 *      done),
				 *   3. The request is in the ACTIVE or SYNC
				 *      state and the number of completed
				 *      segments is equal to the total segment
				 *      count.
				 *      (If ACTIVE, the request is waiting for
				 *       ACKs. If SYNC, the request has not
				 *       received any responses because it's
				 *       waiting on a sync point.)
				 */
				if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
				    __r->state == TID_REQUEST_INACTIVE ||
				    __r->state == TID_REQUEST_COMPLETE ||
				    ((__r->state == TID_REQUEST_ACTIVE ||
				      __r->state == TID_REQUEST_SYNC) &&
				     __r->comp_seg == __r->total_segs)) {
					if (priv->s_tid_tail ==
					    priv->s_tid_cur &&
					    priv->s_state ==
					    TID_OP(WRITE_DATA_LAST)) {
						priv->s_tid_tail = qp->s_cur;
						priv->s_state =
							TID_OP(WRITE_RESP);
					}
					priv->s_tid_cur = qp->s_cur;
				}
				/*
				 * A corner case: when the last TID RDMA WRITE
				 * request was completed, s_tid_head,
				 * s_tid_cur, and s_tid_tail all point to the
				 * same location. Other requests are posted and
				 * s_cur wraps around to the same location,
				 * where a new TID RDMA WRITE is posted. In
				 * this case, none of the indices need to be
				 * updated. However, priv->s_state should
				 * still be updated.
				 */
				if (priv->s_tid_tail == qp->s_cur &&
				    priv->s_state == TID_OP(WRITE_DATA_LAST))
					priv->s_state = TID_OP(WRITE_RESP);
			}
			req = wqe_to_tid_req(wqe);
			if (newreq) {
				priv->s_tid_head = qp->s_cur;
				priv->pending_tid_w_resp += req->total_segs;
				atomic_inc(&priv->n_tid_requests);
				atomic_dec(&priv->n_requests);
			} else {
				req->state = TID_REQUEST_RESEND;
				req->comp_seg = delta_psn(bth2, wqe->psn);
				/*
				 * Pull back any segments since we are going
				 * to re-receive them.
				 */
				req->setup_head = req->clear_tail;
				priv->pending_tid_w_resp +=
					delta_psn(wqe->lpsn, bth2) + 1;
			}

			trace_hfi1_tid_write_sender_make_req(qp, newreq);
			trace_hfi1_tid_req_make_req_write(qp, newreq,
							  wqe->wr.opcode,
							  wqe->psn, wqe->lpsn,
							  req);
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (qp->s_num_rd_atomic >=
			    qp->s_max_rd_atomic) {
				qp->s_flags |= RVT_S_WAIT_RDMAR;
				goto bail;
			}
			qp->s_num_rd_atomic++;
			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			put_ib_reth_vaddr(
				wqe->rdma_wr.remote_addr,
				&ohdr->u.rc.reth);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
			ss = NULL;
			len = 0;
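			/*
			 * Read requests carry no payload; the data arrives
			 * in the RDMA read response packets.
			 */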
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_TID_RDMA_READ:
			trace_hfi1_tid_read_sender_make_req(qp, newreq);
			wpriv = wqe->priv;
			req = wqe_to_tid_req(wqe);
			trace_hfi1_tid_req_make_req_read(qp, newreq,
							 wqe->wr.opcode,
							 wqe->psn, wqe->lpsn,
							 req);
			delta = cmp_psn(qp->s_psn, wqe->psn);

			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow. We could get here under
			 * three conditions: (1) It's a new request; (2) We are
			 * sending the second or later segment of a request,
			 * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
			 * when the last segment of a previous request is
			 * received just before this; (3) We are re-sending a
			 * request.
			 */
			if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
				qp->s_flags |= RVT_S_WAIT_RDMAR;
				goto bail;
			}
			if (newreq) {
				struct tid_rdma_flow *flow =
					&req->flows[req->setup_head];

				/*
				 * Set up s_sge as it is needed for TID
				 * allocation. However, if the pages have
				 * already been walked and mapped (i.e. an
				 * earlier attempt failed to allocate the TID
				 * entries), skip the setup.
				 */
				if (!flow->npagesets) {
					qp->s_sge.sge = wqe->sg_list[0];
					qp->s_sge.sg_list = wqe->sg_list + 1;
					qp->s_sge.num_sge = wqe->wr.num_sge;
					qp->s_sge.total_len = wqe->length;
					qp->s_len = wqe->length;
					req->isge = 0;
					req->clear_tail = req->setup_head;
					req->flow_idx = req->setup_head;
					req->state = TID_REQUEST_ACTIVE;
				}
			} else if (delta == 0) {
				/* Re-send a request */
				req->cur_seg = 0;
				req->comp_seg = 0;
				req->ack_pending = 0;
				req->flow_idx = req->clear_tail;
				req->state = TID_REQUEST_RESEND;
			}
			req->s_next_psn = qp->s_psn;
			/* Read one segment at a time */
			len = min_t(u32, req->seg_len,
				    wqe->length - req->seg_len * req->cur_seg);
			delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
							     &bth1, &bth2,
							     &len);
			if (delta <= 0) {
				/* Wait for TID space */
				goto bail;
			}
			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			hwords += delta;
			ss = &wpriv->ss;
			/* Check if this is the last segment */
			if (req->cur_seg >= req->total_segs &&
			    ++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (qp->s_num_rd_atomic >=
			    qp->s_max_rd_atomic) {
				qp->s_flags |= RVT_S_WAIT_RDMAR;
				goto bail;
			}
			qp->s_num_rd_atomic++;
			fallthrough;
		case IB_WR_OPFN:
			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
			    wqe->wr.opcode == IB_WR_OPFN) {
				qp->s_state = OP(COMPARE_SWAP);
				put_ib_ateth_swap(wqe->atomic_wr.swap,
						  &ohdr->u.atomic_eth);
				put_ib_ateth_compare(wqe->atomic_wr.compare_add,
						     &ohdr->u.atomic_eth);
			} else {
				qp->s_state = OP(FETCH_ADD);
				put_ib_ateth_swap(wqe->atomic_wr.compare_add,
						  &ohdr->u.atomic_eth);
				put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
			}
			put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
					   &ohdr->u.atomic_eth);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->atomic_wr.rkey);
			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
			ss = NULL;
			len = 0;
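			/*
			 * Atomic operands travel in the ATOMIC ETH built
			 * above; the request itself carries no payload.
			 */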
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		default:
			goto bail;
		}
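		/*
		 * Common SGE setup for this WQE. TID RDMA READ manages its
		 * own SGE state (wpriv->ss) and is skipped here.
		 */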
		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
			qp->s_sge.sge = wqe->sg_list[0];
			qp->s_sge.sg_list = wqe->sg_list + 1;
			qp->s_sge.num_sge = wqe->wr.num_sge;
			qp->s_sge.total_len = wqe->length;
			qp->s_len = wqe->length;
		}
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
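		/*
		 * RDMA READ and TID RDMA WRITE consume the WQE's entire PSN
		 * range with a single request packet, so jump s_psn past
		 * lpsn; TID RDMA READ advances to the next segment's PSN;
		 * all other opcodes advance one packet at a time.
		 */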
		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
		    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
			qp->s_psn = wqe->lpsn + 1;
		else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
			qp->s_psn = req->s_next_psn;
		else
			qp->s_psn++;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
		 * thread to indicate a SEND needs to be restarted from an
		 * earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		fallthrough;
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		fallthrough;
	case OP(SEND_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		if (wqe->wr.opcode == IB_WR_SEND) {
			qp->s_state = OP(SEND_LAST);
		} else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
		} else {
			qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
			/* Invalidate rkey comes after the BTH */
			ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= IB_BTH_SOLICITED;
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
		 * thread to indicate an RDMA write needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		fallthrough;
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		fallthrough;
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
			qp->s_state = OP(RDMA_WRITE_LAST);
		} else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
		}
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
		 * thread to indicate an RDMA read needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
		put_ib_reth_vaddr(
			wqe->rdma_wr.remote_addr + len,
			&ohdr->u.rc.reth);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->rdma_wr.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
		bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
		qp->s_psn = wqe->lpsn + 1;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;

	case TID_OP(WRITE_RESP):
		/*
		 * This value for s_state is used for restarting a TID RDMA
		 * WRITE request. See the comment in
		 * OP(RDMA_READ_RESPONSE_MIDDLE) above for more.
		 */
		req = wqe_to_tid_req(wqe);
		req->state = TID_REQUEST_RESEND;
		rcu_read_lock();
		remote = rcu_dereference(priv->tid_rdma.remote);
		req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
		len = wqe->length - (req->comp_seg * remote->max_len);
		rcu_read_unlock();

		bth2 = mask_psn(qp->s_psn);
		hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
							&bth2, &len);
		qp->s_psn = wqe->lpsn + 1;
		ss = NULL;
		qp->s_state = TID_OP(WRITE_REQ);
		priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
		priv->s_tid_cur = qp->s_cur;
		if (++qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
						  wqe->psn, wqe->lpsn, req);
		break;

	case TID_OP(READ_RESP):
		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
			goto bail;
		/* This is used to restart a TID read request */
		req = wqe_to_tid_req(wqe);
		wpriv = wqe->priv;
		/*
		 * Back down. The field qp->s_psn has been set to the psn with
		 * which the request should be restarted. It's OK to use
		 * division as this is on the retry path.
		 * as this is on the retry path.
		 */
		req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;

		/*
		 * Ideally, the following call would return a status so that
		 * we could tell directly whether the flow was found. For
		 * now, the change in req->state is used to check whether
		 * the call succeeded.
		 */
		req->state = TID_REQUEST_RESEND;
		hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
		if (req->state != TID_REQUEST_ACTIVE) {
			/*
			 * Failed to find the flow. Release all allocated tid
			 * resources.
			 */
			hfi1_kern_exp_rcv_clear_all(req);
			hfi1_kern_clear_hw_flow(priv->rcd, qp);

			hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
			goto bail;
		}
		req->state = TID_REQUEST_RESEND;
		len = min_t(u32, req->seg_len,
			    wqe->length - req->seg_len * req->cur_seg);
		flow = &req->flows[req->flow_idx];
		len -= flow->sent;
		req->s_next_psn = flow->flow_state.ib_lpsn + 1;
		delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
							&bth2, &len);
		if (delta <= 0) {
			/* Wait for TID space */
			goto bail;
		}
		hwords += delta;
		ss = &wpriv->ss;
		/* Check if this is the last segment */
		if (req->cur_seg >= req->total_segs &&
		    ++qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		qp->s_psn = req->s_next_psn;
		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
						 wqe->psn, wqe->lpsn, req);
		break;
	case TID_OP(READ_REQ):
		req = wqe_to_tid_req(wqe);
		delta = cmp_psn(qp->s_psn, wqe->psn);
		/*
		 * If the current WR is not TID RDMA READ, or this is the start
		 * of a new request, we need to change the qp->s_state so that
		 * the request can be set up properly.
		 */
		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
		    qp->s_cur == qp->s_tail) {
			qp->s_state = OP(RDMA_READ_REQUEST);
			if (delta == 0 || qp->s_cur == qp->s_tail)
				goto check_s_state;
			else
				goto bail;
		}

		/* Rate limiting */
		if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
			qp->s_flags |= RVT_S_WAIT_RDMAR;
			goto bail;
		}

		wpriv = wqe->priv;
		/* Read one segment at a time */
		len = min_t(u32, req->seg_len,
			    wqe->length - req->seg_len * req->cur_seg);
		delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
						     &bth2, &len);
		if (delta <= 0) {
			/* Wait for TID space */
			goto bail;
		}
		hwords += delta;
		ss = &wpriv->ss;
		/* Check if this is the last segment */
		if (req->cur_seg >= req->total_segs &&
		    ++qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		qp->s_psn = req->s_next_psn;
		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
						 wqe->psn, wqe->lpsn, req);
		break;
	}
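	/*
	 * Track the highest PSN handed to the sender and request an ACK
	 * every HFI1_PSN_CREDIT packets (TID RDMA WRITE has its own
	 * acknowledgment scheme). RVT_S_SEND_ONE forces an ACK request
	 * and a wait-for-ACK after each packet.
	 */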
	qp->s_sending_hpsn = bth2;
	delta = delta_psn(bth2, wqe->psn);
	if (delta && delta % HFI1_PSN_CREDIT == 0 &&
	    wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
		bth2 |= IB_BTH_REQ_ACK;
	if (qp->s_flags & RVT_S_SEND_ONE) {
		qp->s_flags &= ~RVT_S_SEND_ONE;
		qp->s_flags |= RVT_S_WAIT_ACK;
		bth2 |= IB_BTH_REQ_ACK;
	}
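	/* Finish the packet descriptor and build the LRH/BTH headers. */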
	qp->s_len -= len;
	ps->s_txreq->hdr_dwords = hwords;
	ps->s_txreq->sde = priv->s_sde;
	ps->s_txreq->ss = ss;
	ps->s_txreq->s_cur_size = len;
	hfi1_make_ruc_header(
		qp,
		ohdr,
		bth0 | (qp->s_state << 24),
		bth1,
		bth2,
		middle,
		ps);
	return 1;

done_free_tx:
	hfi1_put_txreq(ps->s_txreq);
	ps->s_txreq = NULL;
	return 1;

bail:
	hfi1_put_txreq(ps->s_txreq);

bail_no_tx:
	ps->s_txreq = NULL;
	qp->s_flags &= ~RVT_S_BUSY;
	/*
	 * If we didn't get a txreq, the QP will be woken up later to try
	 * again. Set the flags to indicate which work item to wake
	 * up.
	 */
	iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
	return 0;
}
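
For orientation, a minimal caller-side sketch of the return-value contract follows. It is illustrative only: send_engine_sketch() is a hypothetical name standing in for the driver's real send engine loop, and the loop body is elided.

/*
 * Illustrative sketch only: send_engine_sketch() is a hypothetical
 * stand-in for the driver's send engine. hfi1_make_rc_req() returns 1
 * when a packet has been built in ps->s_txreq and 0 when nothing
 * could be built (RVT_S_BUSY is cleared and the QP is woken up later
 * via the iowait machinery).
 */
static void send_engine_sketch(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
{
	/* qp->s_lock must be held, per the lockdep assertion above. */
	while (hfi1_make_rc_req(qp, ps)) {
		/* hand ps->s_txreq to the SDMA/PIO path for transmission */
	}
}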