ip_vs_proto_tcp.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729
  1. /*
  2. * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
  3. *
  4. * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
  5. * Julian Anastasov <ja@ssi.bg>
  6. *
  7. * This program is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU General Public License
  9. * as published by the Free Software Foundation; either version
  10. * 2 of the License, or (at your option) any later version.
  11. *
  12. * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
  13. *
  14. * Network name space (netns) aware.
  15. * Global data moved to netns i.e struct netns_ipvs
  16. * tcp_timeouts table has copy per netns in a hash table per
  17. * protocol ip_vs_proto_data and is handled by netns
  18. */
  19. #define KMSG_COMPONENT "IPVS"
  20. #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  21. #include <linux/kernel.h>
  22. #include <linux/ip.h>
  23. #include <linux/tcp.h> /* for tcphdr */
  24. #include <net/ip.h>
  25. #include <net/tcp.h> /* for csum_tcpudp_magic */
  26. #include <net/ip6_checksum.h>
  27. #include <linux/netfilter.h>
  28. #include <linux/netfilter_ipv4.h>
  29. #include <net/ip_vs.h>
  30. static int
  31. tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
  32. struct ip_vs_proto_data *pd,
  33. int *verdict, struct ip_vs_conn **cpp,
  34. struct ip_vs_iphdr *iph)
  35. {
  36. struct ip_vs_service *svc;
  37. struct tcphdr _tcph, *th;
  38. __be16 _ports[2], *ports = NULL;
  39. /* In the event of icmp, we're only guaranteed to have the first 8
  40. * bytes of the transport header, so we only check the rest of the
  41. * TCP packet for non-ICMP packets
  42. */
  43. if (likely(!ip_vs_iph_icmp(iph))) {
  44. th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
  45. if (th) {
  46. if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
  47. return 1;
  48. ports = &th->source;
  49. }
  50. } else {
  51. ports = skb_header_pointer(
  52. skb, iph->len, sizeof(_ports), &_ports);
  53. }
  54. if (!ports) {
  55. *verdict = NF_DROP;
  56. return 0;
  57. }
  58. /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
  59. rcu_read_lock();
  60. if (likely(!ip_vs_iph_inverse(iph)))
  61. svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
  62. &iph->daddr, ports[1]);
  63. else
  64. svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
  65. &iph->saddr, ports[0]);
  66. if (svc) {
  67. int ignored;
  68. if (ip_vs_todrop(ipvs)) {
  69. /*
  70. * It seems that we are very loaded.
  71. * We have to drop this packet :(
  72. */
  73. rcu_read_unlock();
  74. *verdict = NF_DROP;
  75. return 0;
  76. }
  77. /*
  78. * Let the virtual server select a real server for the
  79. * incoming connection, and create a connection entry.
  80. */
  81. *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
  82. if (!*cpp && ignored <= 0) {
  83. if (!ignored)
  84. *verdict = ip_vs_leave(svc, skb, pd, iph);
  85. else
  86. *verdict = NF_DROP;
  87. rcu_read_unlock();
  88. return 0;
  89. }
  90. }
  91. rcu_read_unlock();
  92. /* NF_ACCEPT */
  93. return 1;
  94. }
  95. static inline void
  96. tcp_fast_csum_update(int af, struct tcphdr *tcph,
  97. const union nf_inet_addr *oldip,
  98. const union nf_inet_addr *newip,
  99. __be16 oldport, __be16 newport)
  100. {
  101. #ifdef CONFIG_IP_VS_IPV6
  102. if (af == AF_INET6)
  103. tcph->check =
  104. csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
  105. ip_vs_check_diff2(oldport, newport,
  106. ~csum_unfold(tcph->check))));
  107. else
  108. #endif
  109. tcph->check =
  110. csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
  111. ip_vs_check_diff2(oldport, newport,
  112. ~csum_unfold(tcph->check))));
  113. }
  114. static inline void
  115. tcp_partial_csum_update(int af, struct tcphdr *tcph,
  116. const union nf_inet_addr *oldip,
  117. const union nf_inet_addr *newip,
  118. __be16 oldlen, __be16 newlen)
  119. {
  120. #ifdef CONFIG_IP_VS_IPV6
  121. if (af == AF_INET6)
  122. tcph->check =
  123. ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
  124. ip_vs_check_diff2(oldlen, newlen,
  125. csum_unfold(tcph->check))));
  126. else
  127. #endif
  128. tcph->check =
  129. ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
  130. ip_vs_check_diff2(oldlen, newlen,
  131. csum_unfold(tcph->check))));
  132. }
  133. static int
  134. tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
  135. struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
  136. {
  137. struct tcphdr *tcph;
  138. unsigned int tcphoff = iph->len;
  139. int oldlen;
  140. int payload_csum = 0;
  141. #ifdef CONFIG_IP_VS_IPV6
  142. if (cp->af == AF_INET6 && iph->fragoffs)
  143. return 1;
  144. #endif
  145. oldlen = skb->len - tcphoff;
  146. /* csum_check requires unshared skb */
  147. if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
  148. return 0;
  149. if (unlikely(cp->app != NULL)) {
  150. int ret;
  151. /* Some checks before mangling */
  152. if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
  153. return 0;
  154. /* Call application helper if needed */
  155. if (!(ret = ip_vs_app_pkt_out(cp, skb)))
  156. return 0;
  157. /* ret=2: csum update is needed after payload mangling */
  158. if (ret == 1)
  159. oldlen = skb->len - tcphoff;
  160. else
  161. payload_csum = 1;
  162. }
  163. tcph = (void *)skb_network_header(skb) + tcphoff;
  164. tcph->source = cp->vport;
  165. /* Adjust TCP checksums */
  166. if (skb->ip_summed == CHECKSUM_PARTIAL) {
  167. tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
  168. htons(oldlen),
  169. htons(skb->len - tcphoff));
  170. } else if (!payload_csum) {
  171. /* Only port and addr are changed, do fast csum update */
  172. tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
  173. cp->dport, cp->vport);
  174. if (skb->ip_summed == CHECKSUM_COMPLETE)
  175. skb->ip_summed = (cp->app && pp->csum_check) ?
  176. CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
  177. } else {
  178. /* full checksum calculation */
  179. tcph->check = 0;
  180. skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
  181. #ifdef CONFIG_IP_VS_IPV6
  182. if (cp->af == AF_INET6)
  183. tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
  184. &cp->caddr.in6,
  185. skb->len - tcphoff,
  186. cp->protocol, skb->csum);
  187. else
  188. #endif
  189. tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
  190. cp->caddr.ip,
  191. skb->len - tcphoff,
  192. cp->protocol,
  193. skb->csum);
  194. skb->ip_summed = CHECKSUM_UNNECESSARY;
  195. IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
  196. pp->name, tcph->check,
  197. (char*)&(tcph->check) - (char*)tcph);
  198. }
  199. return 1;
  200. }
  201. static int
  202. tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
  203. struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
  204. {
  205. struct tcphdr *tcph;
  206. unsigned int tcphoff = iph->len;
  207. int oldlen;
  208. int payload_csum = 0;
  209. #ifdef CONFIG_IP_VS_IPV6
  210. if (cp->af == AF_INET6 && iph->fragoffs)
  211. return 1;
  212. #endif
  213. oldlen = skb->len - tcphoff;
  214. /* csum_check requires unshared skb */
  215. if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
  216. return 0;
  217. if (unlikely(cp->app != NULL)) {
  218. int ret;
  219. /* Some checks before mangling */
  220. if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
  221. return 0;
  222. /*
  223. * Attempt ip_vs_app call.
  224. * It will fix ip_vs_conn and iph ack_seq stuff
  225. */
  226. if (!(ret = ip_vs_app_pkt_in(cp, skb)))
  227. return 0;
  228. /* ret=2: csum update is needed after payload mangling */
  229. if (ret == 1)
  230. oldlen = skb->len - tcphoff;
  231. else
  232. payload_csum = 1;
  233. }
  234. tcph = (void *)skb_network_header(skb) + tcphoff;
  235. tcph->dest = cp->dport;
  236. /*
  237. * Adjust TCP checksums
  238. */
  239. if (skb->ip_summed == CHECKSUM_PARTIAL) {
  240. tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
  241. htons(oldlen),
  242. htons(skb->len - tcphoff));
  243. } else if (!payload_csum) {
  244. /* Only port and addr are changed, do fast csum update */
  245. tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
  246. cp->vport, cp->dport);
  247. if (skb->ip_summed == CHECKSUM_COMPLETE)
  248. skb->ip_summed = (cp->app && pp->csum_check) ?
  249. CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
  250. } else {
  251. /* full checksum calculation */
  252. tcph->check = 0;
  253. skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
  254. #ifdef CONFIG_IP_VS_IPV6
  255. if (cp->af == AF_INET6)
  256. tcph->check = csum_ipv6_magic(&cp->caddr.in6,
  257. &cp->daddr.in6,
  258. skb->len - tcphoff,
  259. cp->protocol, skb->csum);
  260. else
  261. #endif
  262. tcph->check = csum_tcpudp_magic(cp->caddr.ip,
  263. cp->daddr.ip,
  264. skb->len - tcphoff,
  265. cp->protocol,
  266. skb->csum);
  267. skb->ip_summed = CHECKSUM_UNNECESSARY;
  268. }
  269. return 1;
  270. }
  271. static int
  272. tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
  273. {
  274. unsigned int tcphoff;
  275. #ifdef CONFIG_IP_VS_IPV6
  276. if (af == AF_INET6)
  277. tcphoff = sizeof(struct ipv6hdr);
  278. else
  279. #endif
  280. tcphoff = ip_hdrlen(skb);
  281. switch (skb->ip_summed) {
  282. case CHECKSUM_NONE:
  283. skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
  284. case CHECKSUM_COMPLETE:
  285. #ifdef CONFIG_IP_VS_IPV6
  286. if (af == AF_INET6) {
  287. if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
  288. &ipv6_hdr(skb)->daddr,
  289. skb->len - tcphoff,
  290. ipv6_hdr(skb)->nexthdr,
  291. skb->csum)) {
  292. IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
  293. "Failed checksum for");
  294. return 0;
  295. }
  296. } else
  297. #endif
  298. if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
  299. ip_hdr(skb)->daddr,
  300. skb->len - tcphoff,
  301. ip_hdr(skb)->protocol,
  302. skb->csum)) {
  303. IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
  304. "Failed checksum for");
  305. return 0;
  306. }
  307. break;
  308. default:
  309. /* No need to checksum. */
  310. break;
  311. }
  312. return 1;
  313. }
  314. #define TCP_DIR_INPUT 0
  315. #define TCP_DIR_OUTPUT 4
  316. #define TCP_DIR_INPUT_ONLY 8
  317. static const int tcp_state_off[IP_VS_DIR_LAST] = {
  318. [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
  319. [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
  320. [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
  321. };
  322. /*
  323. * Timeout table[state]
  324. */
  325. static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
  326. [IP_VS_TCP_S_NONE] = 2*HZ,
  327. [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
  328. [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
  329. [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
  330. [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
  331. [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
  332. [IP_VS_TCP_S_CLOSE] = 10*HZ,
  333. [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
  334. [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
  335. [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
  336. [IP_VS_TCP_S_SYNACK] = 120*HZ,
  337. [IP_VS_TCP_S_LAST] = 2*HZ,
  338. };
  339. static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
  340. [IP_VS_TCP_S_NONE] = "NONE",
  341. [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
  342. [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
  343. [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
  344. [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
  345. [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
  346. [IP_VS_TCP_S_CLOSE] = "CLOSE",
  347. [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
  348. [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
  349. [IP_VS_TCP_S_LISTEN] = "LISTEN",
  350. [IP_VS_TCP_S_SYNACK] = "SYNACK",
  351. [IP_VS_TCP_S_LAST] = "BUG!",
  352. };
  353. #define sNO IP_VS_TCP_S_NONE
  354. #define sES IP_VS_TCP_S_ESTABLISHED
  355. #define sSS IP_VS_TCP_S_SYN_SENT
  356. #define sSR IP_VS_TCP_S_SYN_RECV
  357. #define sFW IP_VS_TCP_S_FIN_WAIT
  358. #define sTW IP_VS_TCP_S_TIME_WAIT
  359. #define sCL IP_VS_TCP_S_CLOSE
  360. #define sCW IP_VS_TCP_S_CLOSE_WAIT
  361. #define sLA IP_VS_TCP_S_LAST_ACK
  362. #define sLI IP_VS_TCP_S_LISTEN
  363. #define sSA IP_VS_TCP_S_SYNACK
  364. struct tcp_states_t {
  365. int next_state[IP_VS_TCP_S_LAST];
  366. };
  367. static const char * tcp_state_name(int state)
  368. {
  369. if (state >= IP_VS_TCP_S_LAST)
  370. return "ERR!";
  371. return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
  372. }
  373. static struct tcp_states_t tcp_states [] = {
  374. /* INPUT */
  375. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  376. /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
  377. /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
  378. /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
  379. /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
  380. /* OUTPUT */
  381. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  382. /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
  383. /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
  384. /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
  385. /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
  386. /* INPUT-ONLY */
  387. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  388. /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
  389. /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
  390. /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
  391. /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
  392. };
  393. static struct tcp_states_t tcp_states_dos [] = {
  394. /* INPUT */
  395. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  396. /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
  397. /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
  398. /*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
  399. /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
  400. /* OUTPUT */
  401. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  402. /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
  403. /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
  404. /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
  405. /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
  406. /* INPUT-ONLY */
  407. /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
  408. /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
  409. /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
  410. /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
  411. /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
  412. };
  413. static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
  414. {
  415. int on = (flags & 1); /* secure_tcp */
  416. /*
  417. ** FIXME: change secure_tcp to independent sysctl var
  418. ** or make it per-service or per-app because it is valid
  419. ** for most if not for all of the applications. Something
  420. ** like "capabilities" (flags) for each object.
  421. */
  422. pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
  423. }
  424. static inline int tcp_state_idx(struct tcphdr *th)
  425. {
  426. if (th->rst)
  427. return 3;
  428. if (th->syn)
  429. return 0;
  430. if (th->fin)
  431. return 1;
  432. if (th->ack)
  433. return 2;
  434. return -1;
  435. }
  436. static inline void
  437. set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
  438. int direction, struct tcphdr *th)
  439. {
  440. int state_idx;
  441. int new_state = IP_VS_TCP_S_CLOSE;
  442. int state_off = tcp_state_off[direction];
  443. /*
  444. * Update state offset to INPUT_ONLY if necessary
  445. * or delete NO_OUTPUT flag if output packet detected
  446. */
  447. if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
  448. if (state_off == TCP_DIR_OUTPUT)
  449. cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
  450. else
  451. state_off = TCP_DIR_INPUT_ONLY;
  452. }
  453. if ((state_idx = tcp_state_idx(th)) < 0) {
  454. IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
  455. goto tcp_state_out;
  456. }
  457. new_state =
  458. pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
  459. tcp_state_out:
  460. if (new_state != cp->state) {
  461. struct ip_vs_dest *dest = cp->dest;
  462. IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
  463. "%s:%d state: %s->%s conn->refcnt:%d\n",
  464. pd->pp->name,
  465. ((state_off == TCP_DIR_OUTPUT) ?
  466. "output " : "input "),
  467. th->syn ? 'S' : '.',
  468. th->fin ? 'F' : '.',
  469. th->ack ? 'A' : '.',
  470. th->rst ? 'R' : '.',
  471. IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
  472. ntohs(cp->dport),
  473. IP_VS_DBG_ADDR(cp->af, &cp->caddr),
  474. ntohs(cp->cport),
  475. tcp_state_name(cp->state),
  476. tcp_state_name(new_state),
  477. atomic_read(&cp->refcnt));
  478. if (dest) {
  479. if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
  480. (new_state != IP_VS_TCP_S_ESTABLISHED)) {
  481. atomic_dec(&dest->activeconns);
  482. atomic_inc(&dest->inactconns);
  483. cp->flags |= IP_VS_CONN_F_INACTIVE;
  484. } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
  485. (new_state == IP_VS_TCP_S_ESTABLISHED)) {
  486. atomic_inc(&dest->activeconns);
  487. atomic_dec(&dest->inactconns);
  488. cp->flags &= ~IP_VS_CONN_F_INACTIVE;
  489. }
  490. }
  491. }
  492. if (likely(pd))
  493. cp->timeout = pd->timeout_table[cp->state = new_state];
  494. else /* What to do ? */
  495. cp->timeout = tcp_timeouts[cp->state = new_state];
  496. }
  497. /*
  498. * Handle state transitions
  499. */
  500. static void
  501. tcp_state_transition(struct ip_vs_conn *cp, int direction,
  502. const struct sk_buff *skb,
  503. struct ip_vs_proto_data *pd)
  504. {
  505. struct tcphdr _tcph, *th;
  506. #ifdef CONFIG_IP_VS_IPV6
  507. int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
  508. #else
  509. int ihl = ip_hdrlen(skb);
  510. #endif
  511. th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
  512. if (th == NULL)
  513. return;
  514. spin_lock_bh(&cp->lock);
  515. set_tcp_state(pd, cp, direction, th);
  516. spin_unlock_bh(&cp->lock);
  517. }
  518. static inline __u16 tcp_app_hashkey(__be16 port)
  519. {
  520. return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
  521. & TCP_APP_TAB_MASK;
  522. }
  523. static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
  524. {
  525. struct ip_vs_app *i;
  526. __u16 hash;
  527. __be16 port = inc->port;
  528. int ret = 0;
  529. struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
  530. hash = tcp_app_hashkey(port);
  531. list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
  532. if (i->port == port) {
  533. ret = -EEXIST;
  534. goto out;
  535. }
  536. }
  537. list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
  538. atomic_inc(&pd->appcnt);
  539. out:
  540. return ret;
  541. }
  542. static void
  543. tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
  544. {
  545. struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
  546. atomic_dec(&pd->appcnt);
  547. list_del_rcu(&inc->p_list);
  548. }
  549. static int
  550. tcp_app_conn_bind(struct ip_vs_conn *cp)
  551. {
  552. struct netns_ipvs *ipvs = cp->ipvs;
  553. int hash;
  554. struct ip_vs_app *inc;
  555. int result = 0;
  556. /* Default binding: bind app only for NAT */
  557. if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
  558. return 0;
  559. /* Lookup application incarnations and bind the right one */
  560. hash = tcp_app_hashkey(cp->vport);
  561. rcu_read_lock();
  562. list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
  563. if (inc->port == cp->vport) {
  564. if (unlikely(!ip_vs_app_inc_get(inc)))
  565. break;
  566. rcu_read_unlock();
  567. IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
  568. "%s:%u to app %s on port %u\n",
  569. __func__,
  570. IP_VS_DBG_ADDR(cp->af, &cp->caddr),
  571. ntohs(cp->cport),
  572. IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
  573. ntohs(cp->vport),
  574. inc->name, ntohs(inc->port));
  575. cp->app = inc;
  576. if (inc->init_conn)
  577. result = inc->init_conn(inc, cp);
  578. goto out;
  579. }
  580. }
  581. rcu_read_unlock();
  582. out:
  583. return result;
  584. }
  585. /*
  586. * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
  587. */
  588. void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
  589. {
  590. struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP);
  591. spin_lock_bh(&cp->lock);
  592. cp->state = IP_VS_TCP_S_LISTEN;
  593. cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
  594. : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
  595. spin_unlock_bh(&cp->lock);
  596. }
  597. /* ---------------------------------------------
  598. * timeouts is netns related now.
  599. * ---------------------------------------------
  600. */
  601. static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
  602. {
  603. ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
  604. pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
  605. sizeof(tcp_timeouts));
  606. if (!pd->timeout_table)
  607. return -ENOMEM;
  608. pd->tcp_state_table = tcp_states;
  609. return 0;
  610. }
  611. static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
  612. {
  613. kfree(pd->timeout_table);
  614. }
  615. struct ip_vs_protocol ip_vs_protocol_tcp = {
  616. .name = "TCP",
  617. .protocol = IPPROTO_TCP,
  618. .num_states = IP_VS_TCP_S_LAST,
  619. .dont_defrag = 0,
  620. .init = NULL,
  621. .exit = NULL,
  622. .init_netns = __ip_vs_tcp_init,
  623. .exit_netns = __ip_vs_tcp_exit,
  624. .register_app = tcp_register_app,
  625. .unregister_app = tcp_unregister_app,
  626. .conn_schedule = tcp_conn_schedule,
  627. .conn_in_get = ip_vs_conn_in_get_proto,
  628. .conn_out_get = ip_vs_conn_out_get_proto,
  629. .snat_handler = tcp_snat_handler,
  630. .dnat_handler = tcp_dnat_handler,
  631. .csum_check = tcp_csum_check,
  632. .state_name = tcp_state_name,
  633. .state_transition = tcp_state_transition,
  634. .app_conn_bind = tcp_app_conn_bind,
  635. .debug_packet = ip_vs_tcpudp_debug_packet,
  636. .timeout_change = tcp_timeout_change,
  637. };