verbs.c

/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
#include <asm/bitops.h>
#include <linux/module.h>	/* try_module_get()/module_put() */

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */

static struct workqueue_struct *rpcrdma_receive_wq;
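
/* rpcrdma_alloc_wq - create the Receive completion workqueue
 *
 * Receive completions are handed off to this workqueue so that reply
 * processing runs in process context instead of in the provider's
 * completion upcall. WQ_MEM_RECLAIM guarantees forward progress when
 * replies must be processed while the system is reclaiming memory.
 */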
int
rpcrdma_alloc_wq(void)
{
	struct workqueue_struct *recv_wq;

	recv_wq = alloc_workqueue("xprtrdma_receive",
				  WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI,
				  0);
	if (!recv_wq)
		return -ENOMEM;

	rpcrdma_receive_wq = recv_wq;
	return 0;
}

void
rpcrdma_destroy_wq(void)
{
	struct workqueue_struct *wq;

	if (rpcrdma_receive_wq) {
		wq = rpcrdma_receive_wq;
		rpcrdma_receive_wq = NULL;
		destroy_workqueue(wq);
	}
}
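
/* Asynchronous event handlers, registered when the QP and CQs are
 * created in rpcrdma_ep_create(). A fatal asynchronous error on a
 * live connection is reported by setting rep_connected to -EIO,
 * invoking rpcrdma_conn_func(), and waking anyone waiting on
 * rep_connect_wait.
 */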
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	pr_err("RPC: %s: %s on device %s ep %p\n",
	       __func__, ib_event_msg(event->event),
	       event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	pr_err("RPC: %s: %s on device %s ep %p\n",
	       __func__, ib_event_msg(event->event),
	       event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}
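
/* Handle one Send completion. Completions for ordinary RPC sends
 * carry the RPCRDMA_IGNORE_COMPLETION cookie in wr_id and are logged
 * only if they finish with an unexpected error. All other send-side
 * completions belong to memory registration work requests and carry
 * a pointer to an rpcrdma_mw, whose completion method is invoked.
 */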
static void
rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
	/* WARNING: Only wr_id and status are reliable at this point */
	if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
		if (wc->status != IB_WC_SUCCESS &&
		    wc->status != IB_WC_WR_FLUSH_ERR)
			pr_err("RPC: %s: SEND: %s\n",
			       __func__, ib_wc_status_msg(wc->status));
	} else {
		struct rpcrdma_mw *r;

		r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		r->mw_sendcompletion(wc);
	}
}

/* The common case is a single send completion is waiting. By
 * passing two WC entries to ib_poll_cq, a return code of 1
 * means there is exactly one WC waiting and no more. We don't
 * have to invoke ib_poll_cq again to know that the CQ has been
 * properly drained.
 */
static void
rpcrdma_sendcq_poll(struct ib_cq *cq)
{
	struct ib_wc *pos, wcs[2];
	int count, rc;

	do {
		pos = wcs;

		rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos);
		if (rc < 0)
			break;

		count = rc;
		while (count-- > 0)
			rpcrdma_sendcq_process_wc(pos++);
	} while (rc == ARRAY_SIZE(wcs));
	return;
}

/* Handle provider send completion upcalls.
 */
static void
rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
{
	do {
		rpcrdma_sendcq_poll(cq);
	} while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
				  IB_CQ_REPORT_MISSED_EVENTS) > 0);
}

static void
rpcrdma_receive_worker(struct work_struct *work)
{
	struct rpcrdma_rep *rep =
			container_of(work, struct rpcrdma_rep, rr_work);

	rpcrdma_reply_handler(rep);
}
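
/* Handle one Receive completion. On success the reply length is
 * recorded and the receive buffer is synced for the CPU before the
 * rep is queued to the receive workqueue; flushed or failed receives
 * are marked with RPCRDMA_BAD_LEN and queued the same way so that
 * rpcrdma_reply_handler() can clean them up in process context.
 */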
static void
rpcrdma_recvcq_process_wc(struct ib_wc *wc)
{
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long)wc->wr_id;

	/* WARNING: Only wr_id and status are reliable at this point */
	if (wc->status != IB_WC_SUCCESS)
		goto out_fail;

	/* status == SUCCESS means all fields in wc are trustworthy */
	if (wc->opcode != IB_WC_RECV)
		return;

	dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
		__func__, rep, wc->byte_len);

	rep->rr_len = wc->byte_len;
	ib_dma_sync_single_for_cpu(rep->rr_device,
				   rdmab_addr(rep->rr_rdmabuf),
				   rep->rr_len, DMA_FROM_DEVICE);
	prefetch(rdmab_to_msg(rep->rr_rdmabuf));

out_schedule:
	queue_work(rpcrdma_receive_wq, &rep->rr_work);
	return;

out_fail:
	if (wc->status != IB_WC_WR_FLUSH_ERR)
		pr_err("RPC: %s: rep %p: %s\n",
		       __func__, rep, ib_wc_status_msg(wc->status));
	rep->rr_len = RPCRDMA_BAD_LEN;
	goto out_schedule;
}

/* The wc array is on stack: automatic memory is always CPU-local.
 *
 * struct ib_wc is 64 bytes, making the poll array potentially
 * large. But this is at the bottom of the call chain. Further
 * substantial work is done in another thread.
 */
static void
rpcrdma_recvcq_poll(struct ib_cq *cq)
{
	struct ib_wc *pos, wcs[4];
	int count, rc;

	do {
		pos = wcs;

		rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos);
		if (rc < 0)
			break;

		count = rc;
		while (count-- > 0)
			rpcrdma_recvcq_process_wc(pos++);
	} while (rc == ARRAY_SIZE(wcs));
}

/* Handle provider receive completion upcalls.
 */
static void
rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
	do {
		rpcrdma_recvcq_poll(cq);
	} while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
				  IB_CQ_REPORT_MISSED_EVENTS) > 0);
}
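
/* Poll both completion queues directly, consuming any completions
 * that are still outstanding. Called before disconnecting or
 * replacing the QP so that flushed work requests are accounted for.
 */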
static void
rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
{
	struct ib_wc wc;

	while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
		rpcrdma_recvcq_process_wc(&wc);
	while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
		rpcrdma_sendcq_process_wc(&wc);
}
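
/* rpcrdma_conn_upcall - handle events from the RDMA connection manager
 *
 * Address and route resolution results are recorded in ia->ri_async_rc
 * and signalled via ia->ri_done. Connection state changes are folded
 * into ep->rep_connected (1 when established, a negative errno
 * otherwise), rpcrdma_conn_func() is invoked, and waiters on
 * ep->rep_connect_wait are woken.
 */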
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
#endif
	struct ib_qp_attr *attr = &ia->ri_qp_attr;
	struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, attr,
			    IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			    iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr->max_dest_rd_atomic,
			attr->max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		/*FALLTHROUGH*/
	default:
		dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
			__func__, sap, rpc_get_port(sap), ep,
			rdma_event_msg(event->event));
		break;
	}

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	if (connstate == 1) {
		int ird = attr->max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;

		pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
			sap, rpc_get_port(sap),
			ia->ri_device->name,
			ia->ri_ops->ro_displayname,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
			sap, rpc_get_port(sap), connstate);
	}
#endif

	return 0;
}

static void rpcrdma_destroy_id(struct rdma_cm_id *id)
{
	if (id) {
		module_put(id->device->owner);
		rdma_destroy_id(id);
	}
}
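
/* rpcrdma_create_id - create and resolve a connection manager ID
 *
 * Allocates an RDMA_PS_TCP cm_id for an RC QP, then resolves the
 * server's address and a route to it. Both steps complete
 * asynchronously through rpcrdma_conn_upcall(), so this function
 * waits on ia->ri_done with a timeout after each step. On success
 * the cm_id is returned with a reference held on the device's
 * module; on failure the cm_id is destroyed and an ERR_PTR is
 * returned.
 */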
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
		  struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
			    IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);

	/* FIXME:
	 * Until xprtrdma supports DEVICE_REMOVAL, the provider must
	 * be pinned while there are active NFS/RDMA mounts to prevent
	 * hangs and crashes at umount time.
	 */
	if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
		dprintk("RPC: %s: Failed to get device module\n",
			__func__);
		ia->ri_async_rc = -ENODEV;
	}
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto put;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto put;

	return id;

put:
	module_put(id->device->owner);
out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct ib_device_attr *devattr = &ia->ri_devattr;
	int rc;

	ia->ri_dma_mr = NULL;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}
	ia->ri_device = ia->ri_id->device;

	ia->ri_pd = ib_alloc_pd(ia->ri_device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	rc = ib_query_device(ia->ri_device, devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out3;
	}

	if (memreg == RPCRDMA_FRMR) {
		if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
		    (devattr->max_fast_reg_page_list_len == 0)) {
			dprintk("RPC: %s: FRMR registration "
				"not supported by HCA\n", __func__);
			memreg = RPCRDMA_MTHCAFMR;
		}
	}
	if (memreg == RPCRDMA_MTHCAFMR) {
		if (!ia->ri_device->alloc_fmr) {
			dprintk("RPC: %s: MTHCAFMR registration "
				"not supported by HCA\n", __func__);
			rc = -EINVAL;
			goto out3;
		}
	}

	switch (memreg) {
	case RPCRDMA_FRMR:
		ia->ri_ops = &rpcrdma_frwr_memreg_ops;
		break;
	case RPCRDMA_ALLPHYSICAL:
		ia->ri_ops = &rpcrdma_physical_memreg_ops;
		break;
	case RPCRDMA_MTHCAFMR:
		ia->ri_ops = &rpcrdma_fmr_memreg_ops;
		break;
	default:
		printk(KERN_ERR "RPC: Unsupported memory "
				"registration mode: %d\n", memreg);
		rc = -ENOMEM;
		goto out3;
	}
	dprintk("RPC: %s: memory registration strategy is '%s'\n",
		__func__, ia->ri_ops->ro_displayname);

	rwlock_init(&ia->ri_qplock);
	return 0;

out3:
	ib_dealloc_pd(ia->ri_pd);
	ia->ri_pd = NULL;
out2:
	rpcrdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}

/*
 * Clean up/close an IA.
 *  o if event handles and PD have been initialized, free them.
 *  o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rpcrdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}

	/* If the pd is still busy, xprtrdma missed freeing a resource */
	if (ia->ri_pd && !IS_ERR(ia->ri_pd))
		ib_dealloc_pd(ia->ri_pd);
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
		  struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr *devattr = &ia->ri_devattr;
	struct ib_cq *sendcq, *recvcq;
	struct ib_cq_init_attr cq_attr = {};
	unsigned int max_qp_wr;
	int rc, err;

	if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
		dprintk("RPC: %s: insufficient sge's available\n",
			__func__);
		return -ENOMEM;
	}

	if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
		dprintk("RPC: %s: insufficient wqe's available\n",
			__func__);
		return -ENOMEM;
	}
	max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS;

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > max_qp_wr)
		cdata->max_requests = max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
	rc = ia->ri_ops->ro_open(ia, ep, cdata);
	if (rc)
		return rc;
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
	if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
		ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
	else if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	init_waitqueue_head(&ep->rep_connect_wait);
	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);

	cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1;
	sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
			      rpcrdma_cq_async_error_upcall, NULL, &cq_attr);
	if (IS_ERR(sendcq)) {
		rc = PTR_ERR(sendcq);
		dprintk("RPC: %s: failed to create send CQ: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1;
	recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
			      rpcrdma_cq_async_error_upcall, NULL, &cq_attr);
	if (IS_ERR(recvcq)) {
		rc = PTR_ERR(recvcq);
		dprintk("RPC: %s: failed to create recv CQ: %i\n",
			__func__, rc);
		goto out2;
	}

	rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		ib_destroy_cq(recvcq);
		goto out2;
	}

	ep->rep_attr.send_cq = sendcq;
	ep->rep_attr.recv_cq = recvcq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (devattr->max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources =
						devattr->max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(sendcq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	if (ia->ri_dma_mr)
		ib_dereg_mr(ia->ri_dma_mr);
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 */
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	cancel_delayed_work_sync(&ep->rep_connect_worker);

	if (ia->ri_id->qp)
		rpcrdma_ep_disconnect(ep, ia);

	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
	rpcrdma_clean_cq(ep->rep_attr.send_cq);

	if (ia->ri_id->qp) {
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	rc = ib_destroy_cq(ep->rep_attr.recv_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	rc = ib_destroy_cq(ep->rep_attr.send_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	if (ia->ri_dma_mr) {
		rc = ib_dereg_mr(ia->ri_dma_mr);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
}

/*
 * Connect unconnected endpoint.
 */
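/* On the first call, a QP is created on the existing cm_id and a
 * connection is attempted. On reconnect, a fresh cm_id and QP are
 * created and swapped in under ri_qplock before retrying. A non-peer
 * reject (-ECONNREFUSED) and certain ORD/IRD combinations are retried
 * a bounded number of times (RDMA_CONNECT_RETRY_MAX).
 */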
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id, *old;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		dprintk("RPC: %s: reconnecting...\n", __func__);

		rpcrdma_ep_disconnect(ep, ia);
		rpcrdma_flush_cqs(ep);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = -EHOSTUNREACH;
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 *    Deregister/remarshal *all* requests!
		 *    Close and recreate adapter, pd, etc!
		 *    Re-determine all attributes still sane!
		 *    More stuff I haven't thought of!
		 *    Rrrgh!
		 */
		if (ia->ri_device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rpcrdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}
		/* END TEMP */
		rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			rpcrdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}

		write_lock(&ia->ri_qplock);
		old = ia->ri_id;
		ia->ri_id = id;
		write_unlock(&ia->ri_qplock);

		rdma_destroy_qp(old);
		rpcrdma_destroy_id(old);
	} else {
		dprintk("RPC: %s: connecting...\n", __func__);
		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			/* do not update ep->rep_connected */
			return -ENETUNREACH;
		}
	}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
			__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
		     ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		struct rpcrdma_xprt *r_xprt;
		unsigned int extras;

		dprintk("RPC: %s: connected\n", __func__);

		r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		extras = r_xprt->rx_buf.rb_bc_srv_max_requests;

		if (extras) {
			rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
			if (rc)
				pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
					__func__, rc);
			rc = 0;
		}
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_flush_cqs(ep);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
					 ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
}
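
/* Allocate an rpcrdma_req for one outstanding RPC. Every req is also
 * linked onto rb_allreqs (under rb_reqslock) so that it can be found
 * and released at transport teardown even when it is not sitting on
 * the send-buffer free list.
 */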
struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
	struct rpcrdma_req *req;

	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (req == NULL)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&req->rl_free);
	spin_lock(&buffer->rb_reqslock);
	list_add(&req->rl_all, &buffer->rb_allreqs);
	spin_unlock(&buffer->rb_reqslock);
	req->rl_buffer = &r_xprt->rx_buf;
	return req;
}
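
/* Allocate an rpcrdma_rep and the registered buffer that will receive
 * the RPC reply. The buffer is sized to the inline receive threshold
 * (cdata->inline_rsize) and is posted later via rpcrdma_ep_post_recv().
 */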
struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_rep *rep;
	int rc;

	rc = -ENOMEM;
	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
	if (rep == NULL)
		goto out;

	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
					       GFP_KERNEL);
	if (IS_ERR(rep->rr_rdmabuf)) {
		rc = PTR_ERR(rep->rr_rdmabuf);
		goto out_free;
	}

	rep->rr_device = ia->ri_device;
	rep->rr_rxprt = r_xprt;
	INIT_WORK(&rep->rr_work, rpcrdma_receive_worker);
	return rep;

out_free:
	kfree(rep);
out:
	return ERR_PTR(rc);
}
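
/* rpcrdma_buffer_create - allocate the transport's buffer pools
 *
 * Invokes the memory registration ops' per-transport setup (ro_init),
 * then allocates rb_max_requests request structures and
 * rb_max_requests + 2 reply structures. Any failure tears down
 * whatever was built via rpcrdma_buffer_destroy().
 */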
int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int i, rc;

	buf->rb_max_requests = r_xprt->rx_data.max_requests;
	buf->rb_bc_srv_max_requests = 0;
	spin_lock_init(&buf->rb_lock);

	rc = ia->ri_ops->ro_init(r_xprt);
	if (rc)
		goto out;

	INIT_LIST_HEAD(&buf->rb_send_bufs);
	INIT_LIST_HEAD(&buf->rb_allreqs);
	spin_lock_init(&buf->rb_reqslock);
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;

		req = rpcrdma_create_req(r_xprt);
		if (IS_ERR(req)) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = PTR_ERR(req);
			goto out;
		}
		req->rl_backchannel = false;
		list_add(&req->rl_free, &buf->rb_send_bufs);
	}

	INIT_LIST_HEAD(&buf->rb_recv_bufs);
	for (i = 0; i < buf->rb_max_requests + 2; i++) {
		struct rpcrdma_rep *rep;

		rep = rpcrdma_create_rep(r_xprt);
		if (IS_ERR(rep)) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = PTR_ERR(rep);
			goto out;
		}
		list_add(&rep->rr_list, &buf->rb_recv_bufs);
	}

	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}
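
/* Pop the first entry off the send or receive free list. Both
 * helpers must be called with rb_lock held, and the caller must
 * first check that the list is not empty.
 */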
static struct rpcrdma_req *
rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_req *req;

	req = list_first_entry(&buf->rb_send_bufs,
			       struct rpcrdma_req, rl_free);
	list_del(&req->rl_free);
	return req;
}

static struct rpcrdma_rep *
rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_rep *rep;

	rep = list_first_entry(&buf->rb_recv_bufs,
			       struct rpcrdma_rep, rr_list);
	list_del(&rep->rr_list);
	return rep;
}
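
/* Release a single rep or req: DMA-unmap and free its registered
 * buffers, then free the structure itself.
 */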
static void
rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
{
	rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
	kfree(rep);
}

void
rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
	rpcrdma_free_regbuf(ia, req->rl_sendbuf);
	rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
	kfree(req);
}

void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);

	while (!list_empty(&buf->rb_recv_bufs)) {
		struct rpcrdma_rep *rep;

		rep = rpcrdma_buffer_get_rep_locked(buf);
		rpcrdma_destroy_rep(ia, rep);
	}

	spin_lock(&buf->rb_reqslock);
	while (!list_empty(&buf->rb_allreqs)) {
		struct rpcrdma_req *req;

		req = list_first_entry(&buf->rb_allreqs,
				       struct rpcrdma_req, rl_all);
		list_del(&req->rl_all);

		spin_unlock(&buf->rb_reqslock);
		rpcrdma_destroy_req(ia, req);
		spin_lock(&buf->rb_reqslock);
	}
	spin_unlock(&buf->rb_reqslock);

	ia->ri_ops->ro_destroy(buf);
}
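
/* rpcrdma_get_mw / rpcrdma_put_mw
 *
 * Take a memory registration object from, or return one to, the
 * rb_mws pool, protected by rb_mwlock. rpcrdma_get_mw() can return
 * NULL when the pool is exhausted; callers must handle that case.
 */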
struct rpcrdma_mw *
rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_mw *mw = NULL;

	spin_lock(&buf->rb_mwlock);
	if (!list_empty(&buf->rb_mws)) {
		mw = list_first_entry(&buf->rb_mws,
				      struct rpcrdma_mw, mw_list);
		list_del_init(&mw->mw_list);
	}
	spin_unlock(&buf->rb_mwlock);

	if (!mw)
		pr_err("RPC: %s: no MWs available\n", __func__);
	return mw;
}

void
rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;

	spin_lock(&buf->rb_mwlock);
	list_add_tail(&mw->mw_list, &buf->rb_mws);
	spin_unlock(&buf->rb_mwlock);
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if available) is attached to send buffer upon return.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;

	spin_lock(&buffers->rb_lock);
	if (list_empty(&buffers->rb_send_bufs))
		goto out_reqbuf;
	req = rpcrdma_buffer_get_req_locked(buffers);
	if (list_empty(&buffers->rb_recv_bufs))
		goto out_repbuf;
	req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
	spin_unlock(&buffers->rb_lock);
	return req;

out_reqbuf:
	spin_unlock(&buffers->rb_lock);
	pr_warn("RPC: %s: out of request buffers\n", __func__);
	return NULL;
out_repbuf:
	spin_unlock(&buffers->rb_lock);
	pr_warn("RPC: %s: out of reply buffers\n", __func__);
	req->rl_reply = NULL;
	return req;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_rep *rep = req->rl_reply;

	req->rl_niovs = 0;
	req->rl_reply = NULL;

	spin_lock(&buffers->rb_lock);
	list_add_tail(&req->rl_free, &buffers->rb_send_bufs);
	if (rep)
		list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
	spin_unlock(&buffers->rb_lock);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from disconnect.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;

	spin_lock(&buffers->rb_lock);
	if (!list_empty(&buffers->rb_recv_bufs))
		req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
	spin_unlock(&buffers->rb_lock);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;

	spin_lock(&buffers->rb_lock);
	list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
	spin_unlock(&buffers->rb_lock);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */
void
rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
{
	dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
		seg->mr_offset,
		(unsigned long long)seg->mr_dma, seg->mr_dmalen);
}

/**
 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
 * @ia: controlling rpcrdma_ia
 * @size: size of buffer to be allocated, in bytes
 * @flags: GFP flags
 *
 * Returns pointer to private header of an area of internally
 * registered memory, or an ERR_PTR. The registered buffer follows
 * the end of the private header.
 *
 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
 * receiving the payload of RDMA RECV operations. regbufs are not
 * used for RDMA READ/WRITE operations, thus are registered only for
 * LOCAL access.
 */
struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
	struct rpcrdma_regbuf *rb;
	struct ib_sge *iov;

	rb = kmalloc(sizeof(*rb) + size, flags);
	if (rb == NULL)
		goto out;

	iov = &rb->rg_iov;
	iov->addr = ib_dma_map_single(ia->ri_device,
				      (void *)rb->rg_base, size,
				      DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
		goto out_free;

	iov->length = size;
	iov->lkey = ia->ri_pd->local_dma_lkey;
	rb->rg_size = size;
	rb->rg_owner = NULL;
	return rb;

out_free:
	kfree(rb);
out:
	return ERR_PTR(-ENOMEM);
}

/**
 * rpcrdma_free_regbuf - deregister and free registered buffer
 * @ia: controlling rpcrdma_ia
 * @rb: regbuf to be deregistered and freed
 */
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
	struct ib_sge *iov;

	if (!rb)
		return;

	iov = &rb->rg_iov;
	ib_dma_unmap_single(ia->ri_device,
			    iov->addr, iov->length, DMA_BIDIRECTIONAL);
	kfree(rb);
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
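/* All Sends posted here carry the RPCRDMA_IGNORE_COMPLETION cookie
 * and most are unsignaled. DECR_CQCOUNT() meters them so that
 * roughly one Send in every rep_cqinit is posted with
 * IB_SEND_SIGNALED, giving the provider a chance to reap its send
 * queue.
 */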
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_device *device = ia->ri_device;
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	struct ib_sge *iov = req->rl_send_iov;
	int i, rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
	send_wr.sg_list = iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;

	for (i = 0; i < send_wr.num_sge; i++)
		ib_dma_sync_single_for_device(device, iov[i].addr,
					      iov[i].length, DMA_TO_DEVICE);
	dprintk("RPC: %s: posting %d s/g entries\n",
		__func__, send_wr.num_sge);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_device,
				   rdmab_addr(rep->rr_rdmabuf),
				   rdmab_length(rep->rr_rdmabuf),
				   DMA_BIDIRECTIONAL);

	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}
/**
 * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests
 * @r_xprt: transport associated with these backchannel resources
 * @count: minimum number of incoming requests expected
 *
 * Returns zero if all requested buffers were posted, or a negative errno.
 */
int
rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
{
	struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
	struct rpcrdma_rep *rep;
	unsigned long flags;
	int rc;

	while (count--) {
		spin_lock_irqsave(&buffers->rb_lock, flags);
		if (list_empty(&buffers->rb_recv_bufs))
			goto out_reqbuf;
		rep = rpcrdma_buffer_get_rep_locked(buffers);
		spin_unlock_irqrestore(&buffers->rb_lock, flags);

		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out_rc;
	}

	return 0;

out_reqbuf:
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	pr_warn("%s: no extra receive buffers\n", __func__);
	return -ENOMEM;

out_rc:
	rpcrdma_recv_buffer_put(rep);
	return rc;
}

/* How many chunk list items fit within our inline buffers?
 */
unsigned int
rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	int bytes, segments;

	bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
	bytes -= RPCRDMA_HDRLEN_MIN;
	if (bytes < sizeof(struct rpcrdma_segment) * 2) {
		pr_warn("RPC: %s: inline threshold too small\n",
			__func__);
		return 0;
	}

	segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
	dprintk("RPC: %s: max chunk list size = %d segments\n",
		__func__, segments);
	return segments;
}