tcp.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. /*
  2. * Copyright (c) 2006 Oracle. All rights reserved.
  3. *
  4. * This software is available to you under a choice of one of two
  5. * licenses. You may choose to be licensed under the terms of the GNU
  6. * General Public License (GPL) Version 2, available from the file
  7. * COPYING in the main directory of this source tree, or the
  8. * OpenIB.org BSD license below:
  9. *
  10. * Redistribution and use in source and binary forms, with or
  11. * without modification, are permitted provided that the following
  12. * conditions are met:
  13. *
  14. * - Redistributions of source code must retain the above
  15. * copyright notice, this list of conditions and the following
  16. * disclaimer.
  17. *
  18. * - Redistributions in binary form must reproduce the above
  19. * copyright notice, this list of conditions and the following
  20. * disclaimer in the documentation and/or other materials
  21. * provided with the distribution.
  22. *
  23. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27. * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28. * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30. * SOFTWARE.
  31. *
  32. */
  33. #include <linux/kernel.h>
  34. #include <linux/slab.h>
  35. #include <linux/in.h>
  36. #include <linux/module.h>
  37. #include <net/tcp.h>
  38. #include <net/net_namespace.h>
  39. #include <net/netns/generic.h>
  40. #include <net/tcp.h>
  41. #include "rds.h"
  42. #include "tcp.h"
  /*
   * State consulted only when exporting per-socket info: the list of
   * connections that currently have callbacks installed, the number of
   * entries on it, and the lock covering both.
   */
  static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
  static LIST_HEAD(rds_tcp_tc_list);
  static unsigned int rds_tcp_tc_count;

  /*
   * Every allocated rds_tcp_connection is linked on rds_tcp_conn_list
   * (under rds_tcp_conn_lock) so that teardown paths can find and
   * destroy it.
   */
  static DEFINE_SPINLOCK(rds_tcp_conn_lock);
  static LIST_HEAD(rds_tcp_conn_list);

  /* slab cache backing struct rds_tcp_connection allocations */
  static struct kmem_cache *rds_tcp_conn_slab;

  #define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
  52. /* doing it this way avoids calling tcp_sk() */
  53. void rds_tcp_nonagle(struct socket *sock)
  54. {
  55. mm_segment_t oldfs = get_fs();
  56. int val = 1;
  57. set_fs(KERNEL_DS);
  58. sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
  59. sizeof(val));
  60. set_fs(oldfs);
  61. }
  /*
   * Single home for module-specific customization of an RDS-TCP socket.
   * Call it once the socket has been created. Anything added here should,
   * as a rule, be made tunable through module_param().
   *
   * At present the only adjustment is disabling Nagle.
   */
  void rds_tcp_tune(struct socket *sock)
  {
  	rds_tcp_nonagle(sock);
  }
  70. u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
  71. {
  72. return tcp_sk(tc->t_sock->sk)->snd_nxt;
  73. }
  74. u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
  75. {
  76. return tcp_sk(tc->t_sock->sk)->snd_una;
  77. }
  78. void rds_tcp_restore_callbacks(struct socket *sock,
  79. struct rds_tcp_connection *tc)
  80. {
  81. rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
  82. write_lock_bh(&sock->sk->sk_callback_lock);
  83. /* done under the callback_lock to serialize with write_space */
  84. spin_lock(&rds_tcp_tc_list_lock);
  85. list_del_init(&tc->t_list_item);
  86. rds_tcp_tc_count--;
  87. spin_unlock(&rds_tcp_tc_list_lock);
  88. tc->t_sock = NULL;
  89. sock->sk->sk_write_space = tc->t_orig_write_space;
  90. sock->sk->sk_data_ready = tc->t_orig_data_ready;
  91. sock->sk->sk_state_change = tc->t_orig_state_change;
  92. sock->sk->sk_user_data = NULL;
  93. write_unlock_bh(&sock->sk->sk_callback_lock);
  94. }
  95. /*
  96. * This is the only path that sets tc->t_sock. Send and receive trust that
  97. * it is set. The RDS_CONN_CONNECTED bit protects those paths from being
  98. * called while it isn't set.
  99. */
  100. void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
  101. {
  102. struct rds_tcp_connection *tc = conn->c_transport_data;
  103. rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
  104. write_lock_bh(&sock->sk->sk_callback_lock);
  105. /* done under the callback_lock to serialize with write_space */
  106. spin_lock(&rds_tcp_tc_list_lock);
  107. list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
  108. rds_tcp_tc_count++;
  109. spin_unlock(&rds_tcp_tc_list_lock);
  110. /* accepted sockets need our listen data ready undone */
  111. if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
  112. sock->sk->sk_data_ready = sock->sk->sk_user_data;
  113. tc->t_sock = sock;
  114. tc->conn = conn;
  115. tc->t_orig_data_ready = sock->sk->sk_data_ready;
  116. tc->t_orig_write_space = sock->sk->sk_write_space;
  117. tc->t_orig_state_change = sock->sk->sk_state_change;
  118. sock->sk->sk_user_data = conn;
  119. sock->sk->sk_data_ready = rds_tcp_data_ready;
  120. sock->sk->sk_write_space = rds_tcp_write_space;
  121. sock->sk->sk_state_change = rds_tcp_state_change;
  122. write_unlock_bh(&sock->sk->sk_callback_lock);
  123. }
  124. static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
  125. struct rds_info_iterator *iter,
  126. struct rds_info_lengths *lens)
  127. {
  128. struct rds_info_tcp_socket tsinfo;
  129. struct rds_tcp_connection *tc;
  130. unsigned long flags;
  131. struct sockaddr_in sin;
  132. int sinlen;
  133. spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
  134. if (len / sizeof(tsinfo) < rds_tcp_tc_count)
  135. goto out;
  136. list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
  137. sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
  138. tsinfo.local_addr = sin.sin_addr.s_addr;
  139. tsinfo.local_port = sin.sin_port;
  140. sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
  141. tsinfo.peer_addr = sin.sin_addr.s_addr;
  142. tsinfo.peer_port = sin.sin_port;
  143. tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
  144. tsinfo.data_rem = tc->t_tinc_data_rem;
  145. tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
  146. tsinfo.last_expected_una = tc->t_last_expected_una;
  147. tsinfo.last_seen_una = tc->t_last_seen_una;
  148. rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
  149. }
  150. out:
  151. lens->nr = rds_tcp_tc_count;
  152. lens->each = sizeof(tsinfo);
  153. spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
  154. }
  155. static int rds_tcp_laddr_check(struct net *net, __be32 addr)
  156. {
  157. if (inet_addr_type(net, addr) == RTN_LOCAL)
  158. return 0;
  159. return -EADDRNOTAVAIL;
  160. }
  161. static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
  162. {
  163. struct rds_tcp_connection *tc;
  164. tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
  165. if (!tc)
  166. return -ENOMEM;
  167. tc->t_sock = NULL;
  168. tc->t_tinc = NULL;
  169. tc->t_tinc_hdr_rem = sizeof(struct rds_header);
  170. tc->t_tinc_data_rem = 0;
  171. conn->c_transport_data = tc;
  172. spin_lock_irq(&rds_tcp_conn_lock);
  173. list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
  174. spin_unlock_irq(&rds_tcp_conn_lock);
  175. rdsdebug("alloced tc %p\n", conn->c_transport_data);
  176. return 0;
  177. }
  178. static void rds_tcp_conn_free(void *arg)
  179. {
  180. struct rds_tcp_connection *tc = arg;
  181. unsigned long flags;
  182. rdsdebug("freeing tc %p\n", tc);
  183. spin_lock_irqsave(&rds_tcp_conn_lock, flags);
  184. list_del(&tc->t_tcp_node);
  185. spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
  186. kmem_cache_free(rds_tcp_conn_slab, tc);
  187. }
  188. static void rds_tcp_destroy_conns(void)
  189. {
  190. struct rds_tcp_connection *tc, *_tc;
  191. LIST_HEAD(tmp_list);
  192. /* avoid calling conn_destroy with irqs off */
  193. spin_lock_irq(&rds_tcp_conn_lock);
  194. list_splice(&rds_tcp_conn_list, &tmp_list);
  195. INIT_LIST_HEAD(&rds_tcp_conn_list);
  196. spin_unlock_irq(&rds_tcp_conn_lock);
  197. list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
  198. if (tc->conn->c_passive)
  199. rds_conn_destroy(tc->conn->c_passive);
  200. rds_conn_destroy(tc->conn);
  201. }
  202. }
  203. static void rds_tcp_exit(void);
  204. struct rds_transport rds_tcp_transport = {
  205. .laddr_check = rds_tcp_laddr_check,
  206. .xmit_prepare = rds_tcp_xmit_prepare,
  207. .xmit_complete = rds_tcp_xmit_complete,
  208. .xmit = rds_tcp_xmit,
  209. .recv = rds_tcp_recv,
  210. .conn_alloc = rds_tcp_conn_alloc,
  211. .conn_free = rds_tcp_conn_free,
  212. .conn_connect = rds_tcp_conn_connect,
  213. .conn_shutdown = rds_tcp_conn_shutdown,
  214. .inc_copy_to_user = rds_tcp_inc_copy_to_user,
  215. .inc_free = rds_tcp_inc_free,
  216. .stats_info_copy = rds_tcp_stats_info_copy,
  217. .exit = rds_tcp_exit,
  218. .t_owner = THIS_MODULE,
  219. .t_name = "tcp",
  220. .t_type = RDS_TRANS_TCP,
  221. .t_prefer_loopback = 1,
  222. };
  223. static int rds_tcp_netid;
  224. /* per-network namespace private data for this module */
  225. struct rds_tcp_net {
  226. struct socket *rds_tcp_listen_sock;
  227. struct work_struct rds_tcp_accept_w;
  228. };
  229. static void rds_tcp_accept_worker(struct work_struct *work)
  230. {
  231. struct rds_tcp_net *rtn = container_of(work,
  232. struct rds_tcp_net,
  233. rds_tcp_accept_w);
  234. while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0)
  235. cond_resched();
  236. }
  237. void rds_tcp_accept_work(struct sock *sk)
  238. {
  239. struct net *net = sock_net(sk);
  240. struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
  241. queue_work(rds_wq, &rtn->rds_tcp_accept_w);
  242. }
  243. static __net_init int rds_tcp_init_net(struct net *net)
  244. {
  245. struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
  246. rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
  247. if (!rtn->rds_tcp_listen_sock) {
  248. pr_warn("could not set up listen sock\n");
  249. return -EAFNOSUPPORT;
  250. }
  251. INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
  252. return 0;
  253. }
  254. static void __net_exit rds_tcp_exit_net(struct net *net)
  255. {
  256. struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
  257. /* If rds_tcp_exit_net() is called as a result of netns deletion,
  258. * the rds_tcp_kill_sock() device notifier would already have cleaned
  259. * up the listen socket, thus there is no work to do in this function.
  260. *
  261. * If rds_tcp_exit_net() is called as a result of module unload,
  262. * i.e., due to rds_tcp_exit() -> unregister_pernet_subsys(), then
  263. * we do need to clean up the listen socket here.
  264. */
  265. if (rtn->rds_tcp_listen_sock) {
  266. rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
  267. rtn->rds_tcp_listen_sock = NULL;
  268. flush_work(&rtn->rds_tcp_accept_w);
  269. }
  270. }
  271. static struct pernet_operations rds_tcp_net_ops = {
  272. .init = rds_tcp_init_net,
  273. .exit = rds_tcp_exit_net,
  274. .id = &rds_tcp_netid,
  275. .size = sizeof(struct rds_tcp_net),
  276. };
  277. static void rds_tcp_kill_sock(struct net *net)
  278. {
  279. struct rds_tcp_connection *tc, *_tc;
  280. struct sock *sk;
  281. LIST_HEAD(tmp_list);
  282. struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
  283. rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
  284. rtn->rds_tcp_listen_sock = NULL;
  285. flush_work(&rtn->rds_tcp_accept_w);
  286. spin_lock_irq(&rds_tcp_conn_lock);
  287. list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
  288. struct net *c_net = read_pnet(&tc->conn->c_net);
  289. if (net != c_net)
  290. continue;
  291. list_move_tail(&tc->t_tcp_node, &tmp_list);
  292. }
  293. spin_unlock_irq(&rds_tcp_conn_lock);
  294. list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
  295. sk = tc->t_sock->sk;
  296. sk->sk_prot->disconnect(sk, 0);
  297. tcp_done(sk);
  298. if (tc->conn->c_passive)
  299. rds_conn_destroy(tc->conn->c_passive);
  300. rds_conn_destroy(tc->conn);
  301. }
  302. }
  303. static int rds_tcp_dev_event(struct notifier_block *this,
  304. unsigned long event, void *ptr)
  305. {
  306. struct net_device *dev = netdev_notifier_info_to_dev(ptr);
  307. /* rds-tcp registers as a pernet subys, so the ->exit will only
  308. * get invoked after network acitivity has quiesced. We need to
  309. * clean up all sockets to quiesce network activity, and use
  310. * the unregistration of the per-net loopback device as a trigger
  311. * to start that cleanup.
  312. */
  313. if (event == NETDEV_UNREGISTER_FINAL &&
  314. dev->ifindex == LOOPBACK_IFINDEX)
  315. rds_tcp_kill_sock(dev_net(dev));
  316. return NOTIFY_DONE;
  317. }
  318. static struct notifier_block rds_tcp_dev_notifier = {
  319. .notifier_call = rds_tcp_dev_event,
  320. .priority = -10, /* must be called after other network notifiers */
  321. };
  322. static void rds_tcp_exit(void)
  323. {
  324. rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
  325. unregister_pernet_subsys(&rds_tcp_net_ops);
  326. if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
  327. pr_warn("could not unregister rds_tcp_dev_notifier\n");
  328. rds_tcp_destroy_conns();
  329. rds_trans_unregister(&rds_tcp_transport);
  330. rds_tcp_recv_exit();
  331. kmem_cache_destroy(rds_tcp_conn_slab);
  332. }
  333. module_exit(rds_tcp_exit);
  334. static int rds_tcp_init(void)
  335. {
  336. int ret;
  337. rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
  338. sizeof(struct rds_tcp_connection),
  339. 0, 0, NULL);
  340. if (!rds_tcp_conn_slab) {
  341. ret = -ENOMEM;
  342. goto out;
  343. }
  344. ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
  345. if (ret) {
  346. pr_warn("could not register rds_tcp_dev_notifier\n");
  347. goto out;
  348. }
  349. ret = register_pernet_subsys(&rds_tcp_net_ops);
  350. if (ret)
  351. goto out_slab;
  352. ret = rds_tcp_recv_init();
  353. if (ret)
  354. goto out_pernet;
  355. ret = rds_trans_register(&rds_tcp_transport);
  356. if (ret)
  357. goto out_recv;
  358. rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
  359. goto out;
  360. out_recv:
  361. rds_tcp_recv_exit();
  362. out_pernet:
  363. unregister_pernet_subsys(&rds_tcp_net_ops);
  364. out_slab:
  365. kmem_cache_destroy(rds_tcp_conn_slab);
  366. out:
  367. return ret;
  368. }
  369. module_init(rds_tcp_init);
  370. MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
  371. MODULE_DESCRIPTION("RDS: TCP transport");
  372. MODULE_LICENSE("Dual BSD/GPL");