/* ip_vs_conn.c */
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the Netfilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others. Many code here is taken from IP MASQ code of kernel 2.2.
 *
 * Changes:
 *
 */
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/net.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/proc_fs.h>		/* for proc_net_* */
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/jhash.h>
#include <linux/random.h>

#include <net/net_namespace.h>
#include <net/ip_vs.h>

#ifndef CONFIG_IP_VS_TAB_BITS
#define CONFIG_IP_VS_TAB_BITS	12
#endif

/*
 * Connection hash size. Default is what was selected at compile time.
 * Exposed read-only (0444) as the conn_tab_bits module parameter.
 */
static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");

/* size and mask values derived from conn_tab_bits at init time */
int ip_vs_conn_tab_size __read_mostly;
static int ip_vs_conn_tab_mask __read_mostly;

/*
 * Connection hash table: for input and output packets lookups of IPVS
 */
static struct hlist_head *ip_vs_conn_tab __read_mostly;

/* SLAB cache for IPVS connections */
static struct kmem_cache *ip_vs_conn_cachep __read_mostly;

/* counter for no client port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);

/* random value for IPVS connection hash, seeds jhash below */
static unsigned int ip_vs_conn_rnd __read_mostly;

/*
 * Fine locking granularity for big connection hash table
 */
#define CT_LOCKARRAY_BITS	5
#define CT_LOCKARRAY_SIZE	(1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK	(CT_LOCKARRAY_SIZE-1)

/* We need an addrstrlen that works with or without v6 */
#ifdef CONFIG_IP_VS_IPV6
#define IP_VS_ADDRSTRLEN	INET6_ADDRSTRLEN
#else
#define IP_VS_ADDRSTRLEN	(8+1)
#endif
/* One spinlock per cache line so neighbouring locks in the array
 * below do not false-share.
 */
struct ip_vs_aligned_lock
{
	spinlock_t	l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

/* lock array for conn table; a bucket's hash selects its lock via
 * CT_LOCKARRAY_MASK, so several buckets share one lock
 */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;

/* Serialize writers of the conn-table buckets hashing to @key
 * (readers use RCU; BH disabled because lookups run in softirq).
 */
static inline void ct_write_lock_bh(unsigned int key)
{
	spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock_bh(unsigned int key)
{
	spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
  89. /*
  90. * Returns hash value for IPVS connection entry
  91. */
/*
 * Returns hash value for IPVS connection entry.
 * Mixes (af-sized) address, port and protocol through jhash, seeded by
 * ip_vs_conn_rnd, then XORs in the netns pointer (>>8 to drop the
 * always-zero low bits) so different namespaces spread differently.
 * Result is masked to the table size.
 */
static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
				       const union nf_inet_addr *addr,
				       __be16 port)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
				     (__force u32)port, proto, ip_vs_conn_rnd) ^
			((size_t)ipvs>>8)) & ip_vs_conn_tab_mask;
#endif
	return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
			     ip_vs_conn_rnd) ^
		((size_t)ipvs>>8)) & ip_vs_conn_tab_mask;
}
  106. static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
  107. bool inverse)
  108. {
  109. const union nf_inet_addr *addr;
  110. __be16 port;
  111. if (p->pe_data && p->pe->hashkey_raw)
  112. return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
  113. ip_vs_conn_tab_mask;
  114. if (likely(!inverse)) {
  115. addr = p->caddr;
  116. port = p->cport;
  117. } else {
  118. addr = p->vaddr;
  119. port = p->vport;
  120. }
  121. return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port);
  122. }
/*
 * Hash key for an existing connection: build a param from the conn's
 * client tuple (virtual side left empty) and reuse the param hasher,
 * carrying over any persistence-engine data so pe->hashkey_raw is used
 * for PE-controlled templates.
 */
static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
{
	struct ip_vs_conn_param p;

	ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
			      &cp->caddr, cp->cport, NULL, 0, &p);

	if (cp->pe) {
		p.pe = cp->pe;
		p.pe_data = cp->pe_data;
		p.pe_data_len = cp->pe_data_len;
	}

	return ip_vs_conn_hashkey_param(&p, false);
}
/*
 * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
 * returns bool success (0 also for one-packet-scheduled conns, which
 * are never hashed).  Takes a reference on the conn for the table.
 */
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
	unsigned int hash;
	int ret;

	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return 0;

	/* Hash by protocol, client address and port */
	hash = ip_vs_conn_hashkey_conn(cp);

	/* bucket lock first, then the per-conn lock (same order as
	 * unhash/unlink to avoid deadlock)
	 */
	ct_write_lock_bh(hash);
	spin_lock(&cp->lock);

	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
		cp->flags |= IP_VS_CONN_F_HASHED;
		/* the table now holds a reference */
		atomic_inc(&cp->refcnt);
		hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
		ret = 1;
	} else {
		pr_err("%s(): request for already hashed, called from %pF\n",
		       __func__, __builtin_return_address(0));
		ret = 0;
	}

	spin_unlock(&cp->lock);
	ct_write_unlock_bh(hash);

	return ret;
}
/*
 * UNhashes ip_vs_conn from ip_vs_conn_tab.
 * returns bool success. Caller should hold conn reference, since the
 * table's own reference is dropped here.
 */
static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
	unsigned int hash;
	int ret;

	/* unhash it and decrease its reference counter */
	hash = ip_vs_conn_hashkey_conn(cp);

	ct_write_lock_bh(hash);
	spin_lock(&cp->lock);

	if (cp->flags & IP_VS_CONN_F_HASHED) {
		hlist_del_rcu(&cp->c_list);
		cp->flags &= ~IP_VS_CONN_F_HASHED;
		atomic_dec(&cp->refcnt);
		ret = 1;
	} else
		ret = 0;

	spin_unlock(&cp->lock);
	ct_write_unlock_bh(hash);

	return ret;
}
/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
 * returns bool success.  Unlike ip_vs_conn_unhash(), only succeeds when
 * the table holds the LAST reference (refcnt==1), so a concurrent user
 * keeps the conn alive and hashed.
 */
static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
{
	unsigned int hash;
	bool ret;

	hash = ip_vs_conn_hashkey_conn(cp);

	ct_write_lock_bh(hash);
	spin_lock(&cp->lock);

	if (cp->flags & IP_VS_CONN_F_HASHED) {
		ret = false;
		/* Decrease refcnt and unlink conn only if we are last user */
		if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
			hlist_del_rcu(&cp->c_list);
			cp->flags &= ~IP_VS_CONN_F_HASHED;
			ret = true;
		}
	} else
		/* not hashed (e.g. one-packet conn): success iff nobody
		 * else still references it
		 */
		ret = atomic_read(&cp->refcnt) ? false : true;

	spin_unlock(&cp->lock);
	ct_write_unlock_bh(hash);

	return ret;
}
/*
 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 * Called for pkts coming from OUTside-to-INside.
 *	p->caddr, p->cport: pkt source address (foreign host)
 *	p->vaddr, p->vport: pkt dest address (load balancer)
 * Returns the conn with a reference taken, or NULL.  Lookup is lockless
 * under RCU; __ip_vs_conn_get() may fail for a conn being freed, in
 * which case we keep scanning the bucket.
 */
static inline struct ip_vs_conn *
__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
	unsigned int hash;
	struct ip_vs_conn *cp;

	hash = ip_vs_conn_hashkey_param(p, false);

	rcu_read_lock();

	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
		if (p->cport == cp->cport && p->vport == cp->vport &&
		    cp->af == p->af &&
		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
		    /* a zero cport must match only NO_CPORT conns */
		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
		    p->protocol == cp->protocol &&
		    cp->ipvs == p->ipvs) {
			if (!__ip_vs_conn_get(cp))
				continue;
			/* HIT */
			rcu_read_unlock();
			return cp;
		}
	}

	rcu_read_unlock();

	return NULL;
}
/*
 * In-direction lookup wrapper: on a miss, retry with cport==0 so that
 * connections created without a known client port (NO_CPORT, e.g.
 * FTP data) can still be found — but only while such conns exist.
 */
struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
	struct ip_vs_conn *cp;

	cp = __ip_vs_conn_in_get(p);
	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
		struct ip_vs_conn_param cport_zero_p = *p;
		cport_zero_p.cport = 0;
		cp = __ip_vs_conn_in_get(&cport_zero_p);
	}

	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(p->protocol),
		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
		      cp ? "hit" : "not hit");

	return cp;
}
/*
 * Fill a conn param from the transport header of @skb.
 * Reads the two ports (frag-safely) at iph->len; swaps source and
 * destination when the header was parsed in inverse direction.
 * Returns 0 on success, 1 if the ports could not be read.
 */
static int
ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs,
			    int af, const struct sk_buff *skb,
			    const struct ip_vs_iphdr *iph,
			    struct ip_vs_conn_param *p)
{
	__be16 _ports[2], *pptr;

	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
	if (pptr == NULL)
		return 1;

	if (likely(!ip_vs_iph_inverse(iph)))
		ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr,
				      pptr[0], &iph->daddr, pptr[1], p);
	else
		ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr,
				      pptr[1], &iph->saddr, pptr[0], p);
	return 0;
}
/*
 * skb-based in-direction lookup: derive the conn param from the packet
 * headers, then do the normal in-get.  NULL if the ports are unreadable
 * or no conn matches.
 */
struct ip_vs_conn *
ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af,
			const struct sk_buff *skb,
			const struct ip_vs_iphdr *iph)
{
	struct ip_vs_conn_param p;

	if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
		return NULL;

	return ip_vs_conn_in_get(&p);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
/* Get reference to connection template.
 * When the param carries persistence-engine data, matching is delegated
 * to pe->ct_match(); otherwise match on the full tuple and require the
 * TEMPLATE flag.  Returns the template with a reference, or NULL.
 */
struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
{
	unsigned int hash;
	struct ip_vs_conn *cp;

	hash = ip_vs_conn_hashkey_param(p, false);

	rcu_read_lock();

	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
		if (unlikely(p->pe_data && p->pe->ct_match)) {
			if (cp->ipvs != p->ipvs)
				continue;
			if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
				if (__ip_vs_conn_get(cp))
					goto out;
			}
			continue;
		}

		if (cp->af == p->af &&
		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
		    /* protocol should only be IPPROTO_IP if
		     * p->vaddr is a fwmark */
		    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
				     p->af, p->vaddr, &cp->vaddr) &&
		    p->vport == cp->vport && p->cport == cp->cport &&
		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
		    p->protocol == cp->protocol &&
		    cp->ipvs == p->ipvs) {
			if (__ip_vs_conn_get(cp))
				goto out;
		}
	}
	cp = NULL;

  out:
	rcu_read_unlock();

	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(p->protocol),
		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
		      cp ? "hit" : "not hit");

	return cp;
}
/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 * Called for pkts coming from inside-to-OUTside.
 *	p->caddr, p->cport: pkt source address (inside host)
 *	p->vaddr, p->vport: pkt dest address (foreign host)
 * Note the reversed matching: the packet's source is the conn's real
 * server (daddr/dport) and its destination is the conn's client.
 * Hash key is therefore computed in inverse direction.
 */
struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
	unsigned int hash;
	struct ip_vs_conn *cp, *ret=NULL;

	/*
	 *	Check for "full" addressed entries
	 */
	hash = ip_vs_conn_hashkey_param(p, true);

	rcu_read_lock();

	hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
		if (p->vport == cp->cport && p->cport == cp->dport &&
		    cp->af == p->af &&
		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
		    p->protocol == cp->protocol &&
		    cp->ipvs == p->ipvs) {
			if (!__ip_vs_conn_get(cp))
				continue;
			/* HIT */
			ret = cp;
			break;
		}
	}

	rcu_read_unlock();

	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(p->protocol),
		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
		      ret ? "hit" : "not hit");

	return ret;
}
/*
 * skb-based out-direction lookup: derive the conn param from the packet
 * headers, then do the out-get.  NULL if the ports are unreadable or no
 * conn matches.
 */
struct ip_vs_conn *
ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
			 const struct sk_buff *skb,
			 const struct ip_vs_iphdr *iph)
{
	struct ip_vs_conn_param p;

	if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
		return NULL;

	return ip_vs_conn_out_get(&p);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
/*
 *      Put back the conn and restart its timer with its timeout
 *	(one-packet conns expire immediately: timeout 0).
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
	unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
		0 : cp->timeout;
	mod_timer(&cp->timer, jiffies+t);

	__ip_vs_conn_put(cp);
}
/*
 *	Fill a no_client_port connection with a client port number.
 *	The conn must be unhashed first because cport is part of the
 *	hash key; if it cannot be unhashed, the conn stays untouched.
 */
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
{
	if (ip_vs_conn_unhash(cp)) {
		spin_lock_bh(&cp->lock);
		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
			atomic_dec(&ip_vs_conn_no_cport_cnt);
			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
			cp->cport = cport;
		}
		spin_unlock_bh(&cp->lock);

		/* hash on new cport */
		ip_vs_conn_hash(cp);
	}
}
/*
 *	Bind a connection entry with the corresponding packet_xmit.
 *	Called by ip_vs_conn_new.
 *	Selects the IPv4 transmitter matching the conn's forwarding
 *	method; tunnel conns may still need the v6 transmitter when the
 *	destination is IPv6.
 */
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_MASQ:
		cp->packet_xmit = ip_vs_nat_xmit;
		break;

	case IP_VS_CONN_F_TUNNEL:
#ifdef CONFIG_IP_VS_IPV6
		if (cp->daf == AF_INET6)
			cp->packet_xmit = ip_vs_tunnel_xmit_v6;
		else
#endif
			cp->packet_xmit = ip_vs_tunnel_xmit;
		break;

	case IP_VS_CONN_F_DROUTE:
		cp->packet_xmit = ip_vs_dr_xmit;
		break;

	case IP_VS_CONN_F_LOCALNODE:
		cp->packet_xmit = ip_vs_null_xmit;
		break;

	case IP_VS_CONN_F_BYPASS:
		cp->packet_xmit = ip_vs_bypass_xmit;
		break;
	}
}
#ifdef CONFIG_IP_VS_IPV6
/* IPv6 counterpart of ip_vs_bind_xmit(): pick the v6 transmitter for
 * the conn's forwarding method (tunnel again depends on the dest
 * address family).
 */
static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
{
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_MASQ:
		cp->packet_xmit = ip_vs_nat_xmit_v6;
		break;

	case IP_VS_CONN_F_TUNNEL:
		if (cp->daf == AF_INET6)
			cp->packet_xmit = ip_vs_tunnel_xmit_v6;
		else
			cp->packet_xmit = ip_vs_tunnel_xmit;
		break;

	case IP_VS_CONN_F_DROUTE:
		cp->packet_xmit = ip_vs_dr_xmit_v6;
		break;

	case IP_VS_CONN_F_LOCALNODE:
		cp->packet_xmit = ip_vs_null_xmit;
		break;

	case IP_VS_CONN_F_BYPASS:
		cp->packet_xmit = ip_vs_bypass_xmit_v6;
		break;
	}
}
#endif
  454. static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
  455. {
  456. return atomic_read(&dest->activeconns)
  457. + atomic_read(&dest->inactconns);
  458. }
/*
 *	Bind a connection entry with a virtual service destination
 *	Called just after a new connection entry is created.
 *	Takes a reference on @dest, merges the dest's conn_flags into the
 *	conn's flags, updates the dest's connection counters and may mark
 *	the dest overloaded.  Caller guarantees cp->dest was NULL.
 */
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
	unsigned int conn_flags;
	__u32 flags;

	/* if dest is NULL, then return directly */
	if (!dest)
		return;

	/* Increase the refcnt counter of the dest */
	ip_vs_dest_hold(dest);

	conn_flags = atomic_read(&dest->conn_flags);
	/* ONE_PACKET scheduling only makes sense for UDP */
	if (cp->protocol != IPPROTO_UDP)
		conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
	flags = cp->flags;
	/* Bind with the destination and its corresponding transmitter */
	if (flags & IP_VS_CONN_F_SYNC) {
		/* if the connection is not template and is created
		 * by sync, preserve the activity flag.
		 */
		if (!(flags & IP_VS_CONN_F_TEMPLATE))
			conn_flags &= ~IP_VS_CONN_F_INACTIVE;
		/* connections inherit forwarding method from dest */
		flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
	}
	flags |= conn_flags;
	cp->flags = flags;
	cp->dest = dest;

	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
		      "dest->refcnt:%d\n",
		      ip_vs_proto_name(cp->protocol),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
		      ip_vs_fwd_tag(cp), cp->state,
		      cp->flags, atomic_read(&cp->refcnt),
		      atomic_read(&dest->refcnt));

	/* Update the connection counters */
	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
		/* It is a normal connection, so modify the counters
		 * according to the flags, later the protocol can
		 * update them on state change
		 */
		if (!(flags & IP_VS_CONN_F_INACTIVE))
			atomic_inc(&dest->activeconns);
		else
			atomic_inc(&dest->inactconns);
	} else {
		/* It is a persistent connection/template, so increase
		   the persistent connection counter */
		atomic_inc(&dest->persistconns);
	}

	if (dest->u_threshold != 0 &&
	    ip_vs_dest_totalconns(dest) >= dest->u_threshold)
		dest->flags |= IP_VS_DEST_F_OVERLOAD;
}
/*
 * Check if there is a destination for the connection, if so
 * bind the connection to the destination.
 * Invoked from the sync code for conns that arrived unbound; rebinds
 * the transmitter and (if needed) the application helper afterwards.
 */
void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest;

	rcu_read_lock();

	/* This function is only invoked by the synchronization code. We do
	 * not currently support heterogeneous pools with synchronization,
	 * so we can make the assumption that the svc_af is the same as the
	 * dest_af
	 */
	dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr,
			       cp->dport, &cp->vaddr, cp->vport,
			       cp->protocol, cp->fwmark, cp->flags);
	if (dest) {
		struct ip_vs_proto_data *pd;

		spin_lock_bh(&cp->lock);
		/* lost the race: someone else bound a dest meanwhile */
		if (cp->dest) {
			spin_unlock_bh(&cp->lock);
			rcu_read_unlock();
			return;
		}

		/* Applications work depending on the forwarding method
		 * but better to reassign them always when binding dest */
		if (cp->app)
			ip_vs_unbind_app(cp);

		ip_vs_bind_dest(cp, dest);
		spin_unlock_bh(&cp->lock);

		/* Update its packet transmitter */
		cp->packet_xmit = NULL;
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			ip_vs_bind_xmit_v6(cp);
		else
#endif
			ip_vs_bind_xmit(cp);

		pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol);
		if (pd && atomic_read(&pd->appcnt))
			ip_vs_bind_app(cp, pd->pp);
	}
	rcu_read_unlock();
}
/*
 *	Unbind a connection entry with its VS destination
 *	Called by the ip_vs_conn_expire function.
 *	Reverses the counter updates of ip_vs_bind_dest(), possibly
 *	clears the overload flag, and drops the dest reference.
 */
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest = cp->dest;

	if (!dest)
		return;

	IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
		      "dest->refcnt:%d\n",
		      ip_vs_proto_name(cp->protocol),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
		      ip_vs_fwd_tag(cp), cp->state,
		      cp->flags, atomic_read(&cp->refcnt),
		      atomic_read(&dest->refcnt));

	/* Update the connection counters */
	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
		/* It is a normal connection, so decrease the inactconns
		   or activeconns counter */
		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
			atomic_dec(&dest->inactconns);
		} else {
			atomic_dec(&dest->activeconns);
		}
	} else {
		/* It is a persistent connection/template, so decrease
		   the persistent connection counter */
		atomic_dec(&dest->persistconns);
	}

	/* Clear overload when we drop below the low threshold, or below
	 * 3/4 of the upper threshold when only that one is set.
	 */
	if (dest->l_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else if (dest->u_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	}

	ip_vs_dest_put(dest);
}
/* True when quiescent (weight==0) templates should be expired, as
 * requested via the expire_quiescent_template sysctl.  Without sysctl
 * support the feature is compiled out and always disabled.
 */
static int expire_quiescent_template(struct netns_ipvs *ipvs,
				     struct ip_vs_dest *dest)
{
#ifdef CONFIG_SYSCTL
	return ipvs->sysctl_expire_quiescent_template &&
		(atomic_read(&dest->weight) == 0);
#else
	return 0;
#endif
}
/*
 *	Checking if the destination of a connection template is available.
 *	If available, return 1, otherwise invalidate this connection
 *	template and return 0.
 *	Invalidation rewrites the tuple with 0xffff sentinels (rehashing
 *	under the table lock) so new clients no longer match it, then
 *	drops the caller's reference without re-arming the timer.
 */
int ip_vs_check_template(struct ip_vs_conn *ct)
{
	struct ip_vs_dest *dest = ct->dest;
	struct netns_ipvs *ipvs = ct->ipvs;

	/*
	 * Checking the dest server status.
	 */
	if ((dest == NULL) ||
	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
	    expire_quiescent_template(ipvs, dest)) {
		IP_VS_DBG_BUF(9, "check_template: dest not available for "
			      "protocol %s s:%s:%d v:%s:%d "
			      "-> d:%s:%d\n",
			      ip_vs_proto_name(ct->protocol),
			      IP_VS_DBG_ADDR(ct->af, &ct->caddr),
			      ntohs(ct->cport),
			      IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
			      ntohs(ct->vport),
			      IP_VS_DBG_ADDR(ct->daf, &ct->daddr),
			      ntohs(ct->dport));

		/*
		 * Invalidate the connection template
		 */
		if (ct->vport != htons(0xffff)) {
			if (ip_vs_conn_unhash(ct)) {
				ct->dport = htons(0xffff);
				ct->vport = htons(0xffff);
				ct->cport = 0;
				ip_vs_conn_hash(ct);
			}
		}

		/*
		 * Simply decrease the refcnt of the template,
		 * don't restart its timer.
		 */
		__ip_vs_conn_put(ct);

		return 0;
	}
	return 1;
}
/* RCU callback: release PE reference and data, then free the conn
 * back to the slab, after all RCU readers are done with it.
 */
static void ip_vs_conn_rcu_free(struct rcu_head *head)
{
	struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
					     rcu_head);

	ip_vs_pe_put(cp->pe);
	kfree(cp->pe_data);
	kmem_cache_free(ip_vs_conn_cachep, cp);
}
/*
 * Timer handler: expire the connection.  If it still controls other
 * conns or cannot be unlinked (other users hold references), take a
 * reference, re-arm for 60s and retry later.  Otherwise tear it down
 * and free it via RCU.
 */
static void ip_vs_conn_expire(unsigned long data)
{
	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
	struct netns_ipvs *ipvs = cp->ipvs;

	/*
	 *	do I control anybody?
	 */
	if (atomic_read(&cp->n_control))
		goto expire_later;

	/* Unlink conn if not referenced anymore */
	if (likely(ip_vs_conn_unlink(cp))) {
		/* delete the timer if it is activated by other users */
		del_timer(&cp->timer);

		/* does anybody control me? */
		if (cp->control)
			ip_vs_control_del(cp);

		if (cp->flags & IP_VS_CONN_F_NFCT) {
			/* Do not access conntracks during subsys cleanup
			 * because nf_conntrack_find_get can not be used after
			 * conntrack cleanup for the net.
			 */
			smp_rmb();
			if (ipvs->enable)
				ip_vs_conn_drop_conntrack(cp);
		}

		if (unlikely(cp->app != NULL))
			ip_vs_unbind_app(cp);
		ip_vs_unbind_dest(cp);
		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
			atomic_dec(&ip_vs_conn_no_cport_cnt);
		call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
		atomic_dec(&ipvs->conn_count);
		return;
	}

  expire_later:
	IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
		  atomic_read(&cp->refcnt),
		  atomic_read(&cp->n_control));

	/* hold a reference so ip_vs_conn_put() can re-arm the timer */
	atomic_inc(&cp->refcnt);
	cp->timeout = 60*HZ;

	if (ipvs->sync_state & IP_VS_STATE_MASTER)
		ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));

	ip_vs_conn_put(cp);
}
/* Modify timer, so that it expires as soon as possible.
 * Can be called without reference only if under RCU lock.
 */
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
	/* Using mod_timer_pending will ensure the timer is not
	 * modified after the final del_timer in ip_vs_conn_expire.
	 */
	if (timer_pending(&cp->timer) &&
	    time_after(cp->timer.expires, jiffies))
		mod_timer_pending(&cp->timer, jiffies);
}
/*
 *	Create a new connection entry and hash it into the ip_vs_conn_tab
 *	@p:	connection parameters (client/virtual tuple, PE data)
 *	@dest_af, @daddr, @dport: real server side of the connection
 *	@flags:	IP_VS_CONN_F_* flags
 *	@dest:	real server to bind to (may be NULL)
 *	@fwmark: firewall mark of the service
 *	Returns the new conn with one reference held by the caller,
 *	or NULL on allocation failure.
 */
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
	       const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
	       struct ip_vs_dest *dest, __u32 fwmark)
{
	struct ip_vs_conn *cp;
	struct netns_ipvs *ipvs = p->ipvs;
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs,
							   p->protocol);

	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
	if (cp == NULL) {
		IP_VS_ERR_RL("%s(): no memory\n", __func__);
		return NULL;
	}

	INIT_HLIST_NODE(&cp->c_list);
	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
	cp->ipvs	   = ipvs;
	cp->af		   = p->af;
	cp->daf		   = dest_af;
	cp->protocol	   = p->protocol;
	ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
	cp->cport	   = p->cport;
	/* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
	ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
		       &cp->vaddr, p->vaddr);
	cp->vport	   = p->vport;
	ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
	cp->dport          = dport;
	cp->flags	   = flags;
	cp->fwmark         = fwmark;
	/* only templates own the PE data; take a PE reference for them */
	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
		ip_vs_pe_get(p->pe);
		cp->pe = p->pe;
		cp->pe_data = p->pe_data;
		cp->pe_data_len = p->pe_data_len;
	} else {
		cp->pe = NULL;
		cp->pe_data = NULL;
		cp->pe_data_len = 0;
	}
	spin_lock_init(&cp->lock);

	/*
	 * Set the entry is referenced by the current thread before hashing
	 * it in the table, so that other thread run ip_vs_random_dropentry
	 * but cannot drop this entry.
	 */
	atomic_set(&cp->refcnt, 1);

	cp->control = NULL;
	atomic_set(&cp->n_control, 0);
	atomic_set(&cp->in_pkts, 0);

	cp->packet_xmit = NULL;
	cp->app = NULL;
	cp->app_data = NULL;
	/* reset struct ip_vs_seq */
	cp->in_seq.delta = 0;
	cp->out_seq.delta = 0;

	atomic_inc(&ipvs->conn_count);
	if (flags & IP_VS_CONN_F_NO_CPORT)
		atomic_inc(&ip_vs_conn_no_cport_cnt);

	/* Bind the connection with a destination server */
	cp->dest = NULL;
	ip_vs_bind_dest(cp, dest);

	/* Set its state and timeout */
	cp->state = 0;
	cp->old_state = 0;
	cp->timeout = 3*HZ;
	cp->sync_endtime = jiffies & ~3UL;

	/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
	if (p->af == AF_INET6)
		ip_vs_bind_xmit_v6(cp);
	else
#endif
		ip_vs_bind_xmit(cp);

	if (unlikely(pd && atomic_read(&pd->appcnt)))
		ip_vs_bind_app(cp, pd->pp);

	/*
	 * Allow conntrack to be preserved. By default, conntrack
	 * is created and destroyed for every packet.
	 * Sometimes keeping conntrack can be useful for
	 * IP_VS_CONN_F_ONE_PACKET too.
	 */

	if (ip_vs_conntrack_enabled(ipvs))
		cp->flags |= IP_VS_CONN_F_NFCT;

	/* Hash it in the ip_vs_conn_tab finally */
	ip_vs_conn_hash(cp);

	return cp;
}
  818. /*
  819. * /proc/net/ip_vs_conn entries
  820. */
  821. #ifdef CONFIG_PROC_FS
/* Per-open iterator state for the /proc seq_files below: remembers the
 * hash bucket of the entry last returned so _next can resume from it.
 */
struct ip_vs_iter_state {
	struct seq_net_private	p;
	struct hlist_head	*l;	/* current ip_vs_conn_tab bucket */
};
/* Return the pos'th (0-based) connection in the whole table, walking
 * buckets under RCU, and record its bucket in iter->l.  Returns NULL
 * when pos is past the last entry.  Caller holds rcu_read_lock (taken
 * in ip_vs_conn_seq_start).
 */
static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
{
	int idx;
	struct ip_vs_conn *cp;
	struct ip_vs_iter_state *iter = seq->private;

	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
			/* __ip_vs_conn_get() is not needed by
			 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
			 */
			if (pos-- == 0) {
				iter->l = &ip_vs_conn_tab[idx];
				return cp;
			}
		}
		cond_resched_rcu();
	}

	return NULL;
}
  845. static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
  846. __acquires(RCU)
  847. {
  848. struct ip_vs_iter_state *iter = seq->private;
  849. iter->l = NULL;
  850. rcu_read_lock();
  851. return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
  852. }
/* seq_file .next: advance to the next entry on the same hash chain,
 * else the first entry of a later non-empty bucket.  Keeps iter->l in
 * sync with the returned entry's bucket (NULL at end of table).
 */
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_vs_conn *cp = v;
	struct ip_vs_iter_state *iter = seq->private;
	struct hlist_node *e;
	struct hlist_head *l = iter->l;
	int idx;

	++*pos;
	/* After the header, restart the walk at the first connection */
	if (v == SEQ_START_TOKEN)
		return ip_vs_conn_array(seq, 0);

	/* more on same hash chain? */
	e = rcu_dereference(hlist_next_rcu(&cp->c_list));
	if (e)
		return hlist_entry(e, struct ip_vs_conn, c_list);

	/* Chain exhausted: scan the remaining buckets for a first entry */
	idx = l - ip_vs_conn_tab;
	while (++idx < ip_vs_conn_tab_size) {
		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
			iter->l = &ip_vs_conn_tab[idx];
			return cp;
		}
		cond_resched_rcu();
	}
	iter->l = NULL;
	return NULL;
}
/* seq_file .stop: drop the RCU read lock taken in ip_vs_conn_seq_start */
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
/* Emit one /proc/net/ip_vs_conn line: a column header for the start
 * token, otherwise protocol, client/virtual/destination address:port,
 * state name, remaining expiry time and, for entries with pe_data,
 * the persistence-engine name and data.
 */
static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq,
			 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
	else {
		const struct ip_vs_conn *cp = v;
		struct net *net = seq_file_net(seq);
		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
		size_t len = 0;
		char dbuf[IP_VS_ADDRSTRLEN];

		/* Table is shared by all netns; show this netns only */
		if (!net_eq(cp->ipvs->net, net))
			return 0;

		/* Build " <pe name> <pe data>" suffix; fits in pe_data:
		 * 1 + name + 1 + data + NUL <= PENAME + PEDATA + 3.
		 */
		if (cp->pe_data) {
			pe_data[0] = ' ';
			len = strlen(cp->pe->name);
			memcpy(pe_data + 1, cp->pe->name, len);
			pe_data[len + 1] = ' ';
			len += 2;
			len += cp->pe->show_pe_data(cp, pe_data + len);
		}
		pe_data[len] = '\0';

		/* Destination can be a different family (cp->daf) */
#ifdef CONFIG_IP_VS_IPV6
		if (cp->daf == AF_INET6)
			snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
		else
#endif
			snprintf(dbuf, sizeof(dbuf), "%08X",
				 ntohl(cp->daddr.ip));

#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
				   "%s %04X %-11s %7lu%s\n",
				   ip_vs_proto_name(cp->protocol),
				   &cp->caddr.in6, ntohs(cp->cport),
				   &cp->vaddr.in6, ntohs(cp->vport),
				   dbuf, ntohs(cp->dport),
				   ip_vs_state_name(cp->protocol, cp->state),
				   (cp->timer.expires-jiffies)/HZ, pe_data);
		else
#endif
			seq_printf(seq,
				   "%-3s %08X %04X %08X %04X"
				   " %s %04X %-11s %7lu%s\n",
				   ip_vs_proto_name(cp->protocol),
				   ntohl(cp->caddr.ip), ntohs(cp->cport),
				   ntohl(cp->vaddr.ip), ntohs(cp->vport),
				   dbuf, ntohs(cp->dport),
				   ip_vs_state_name(cp->protocol, cp->state),
				   (cp->timer.expires-jiffies)/HZ, pe_data);
	}
	return 0;
}
/* seq_file callbacks for /proc/net/ip_vs_conn */
static const struct seq_operations ip_vs_conn_seq_ops = {
	.start = ip_vs_conn_seq_start,
	.next  = ip_vs_conn_seq_next,
	.stop  = ip_vs_conn_seq_stop,
	.show  = ip_vs_conn_seq_show,
};
/* open() for /proc/net/ip_vs_conn: per-netns seq_file carrying an
 * ip_vs_iter_state as private data.
 */
static int ip_vs_conn_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
			    sizeof(struct ip_vs_iter_state));
}
/* file_operations for /proc/net/ip_vs_conn */
static const struct file_operations ip_vs_conn_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_conn_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
  954. static const char *ip_vs_origin_name(unsigned int flags)
  955. {
  956. if (flags & IP_VS_CONN_F_SYNC)
  957. return "SYNC";
  958. else
  959. return "LOCAL";
  960. }
/* Emit one /proc/net/ip_vs_conn_sync line: like ip_vs_conn_seq_show
 * but prints the connection origin (LOCAL vs SYNC) instead of the
 * persistence-engine data.
 */
static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
{
	char dbuf[IP_VS_ADDRSTRLEN];

	if (v == SEQ_START_TOKEN)
		seq_puts(seq,
			 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
	else {
		const struct ip_vs_conn *cp = v;
		struct net *net = seq_file_net(seq);

		/* Table is shared by all netns; show this netns only */
		if (!net_eq(cp->ipvs->net, net))
			return 0;

		/* Destination can be a different family (cp->daf) */
#ifdef CONFIG_IP_VS_IPV6
		if (cp->daf == AF_INET6)
			snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
		else
#endif
			snprintf(dbuf, sizeof(dbuf), "%08X",
				 ntohl(cp->daddr.ip));

#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
				   "%s %04X %-11s %-6s %7lu\n",
				   ip_vs_proto_name(cp->protocol),
				   &cp->caddr.in6, ntohs(cp->cport),
				   &cp->vaddr.in6, ntohs(cp->vport),
				   dbuf, ntohs(cp->dport),
				   ip_vs_state_name(cp->protocol, cp->state),
				   ip_vs_origin_name(cp->flags),
				   (cp->timer.expires-jiffies)/HZ);
		else
#endif
			seq_printf(seq,
				   "%-3s %08X %04X %08X %04X "
				   "%s %04X %-11s %-6s %7lu\n",
				   ip_vs_proto_name(cp->protocol),
				   ntohl(cp->caddr.ip), ntohs(cp->cport),
				   ntohl(cp->vaddr.ip), ntohs(cp->vport),
				   dbuf, ntohs(cp->dport),
				   ip_vs_state_name(cp->protocol, cp->state),
				   ip_vs_origin_name(cp->flags),
				   (cp->timer.expires-jiffies)/HZ);
	}
	return 0;
}
/* seq_file callbacks for /proc/net/ip_vs_conn_sync; start/next/stop
 * are shared with the ip_vs_conn iterator.
 */
static const struct seq_operations ip_vs_conn_sync_seq_ops = {
	.start = ip_vs_conn_seq_start,
	.next  = ip_vs_conn_seq_next,
	.stop  = ip_vs_conn_seq_stop,
	.show  = ip_vs_conn_sync_seq_show,
};
/* open() for /proc/net/ip_vs_conn_sync: per-netns seq_file carrying an
 * ip_vs_iter_state as private data.
 */
static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
			    sizeof(struct ip_vs_iter_state));
}
/* file_operations for /proc/net/ip_vs_conn_sync */
static const struct file_operations ip_vs_conn_sync_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_conn_sync_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
  1023. #endif
  1024. /*
  1025. * Randomly drop connection entries before running out of memory
  1026. */
  1027. static inline int todrop_entry(struct ip_vs_conn *cp)
  1028. {
  1029. /*
  1030. * The drop rate array needs tuning for real environments.
  1031. * Called from timer bh only => no locking
  1032. */
  1033. static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
  1034. static char todrop_counter[9] = {0};
  1035. int i;
  1036. /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
  1037. This will leave enough time for normal connection to get
  1038. through. */
  1039. if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
  1040. return 0;
  1041. /* Don't drop the entry if its number of incoming packets is not
  1042. located in [0, 8] */
  1043. i = atomic_read(&cp->in_pkts);
  1044. if (i > 8 || i < 0) return 0;
  1045. if (!todrop_rate[i]) return 0;
  1046. if (--todrop_counter[i] > 0) return 0;
  1047. todrop_counter[i] = todrop_rate[i];
  1048. return 1;
  1049. }
/* Called from keventd and must protect itself from softirqs */
void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
{
	int idx;
	struct ip_vs_conn *cp, *cp_c;

	rcu_read_lock();
	/*
	 * Randomly scan 1/32 of the whole table every second
	 */
	for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
		unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;

		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
				/* connection template */
				continue;
			if (cp->ipvs != ipvs)
				continue;
			if (cp->protocol == IPPROTO_TCP) {
				switch(cp->state) {
				case IP_VS_TCP_S_SYN_RECV:
				case IP_VS_TCP_S_SYNACK:
					/* half-open: always a drop candidate */
					break;

				case IP_VS_TCP_S_ESTABLISHED:
					/* established: rate-limited drop only */
					if (todrop_entry(cp))
						break;
					continue;

				default:
					continue;
				}
			} else if (cp->protocol == IPPROTO_SCTP) {
				switch (cp->state) {
				case IP_VS_SCTP_S_INIT1:
				case IP_VS_SCTP_S_INIT:
					/* handshake: always a drop candidate */
					break;
				case IP_VS_SCTP_S_ESTABLISHED:
					if (todrop_entry(cp))
						break;
					continue;
				default:
					continue;
				}
			} else {
				/* other protocols: rate-limited drop only */
				if (!todrop_entry(cp))
					continue;
			}

			/* Expire now; the timer/RCU machinery frees it */
			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_expire_now(cp);
			cp_c = cp->control;
			/* cp->control is valid only with reference to cp */
			if (cp_c && __ip_vs_conn_get(cp)) {
				IP_VS_DBG(4, "del conn template\n");
				ip_vs_conn_expire_now(cp_c);
				__ip_vs_conn_put(cp);
			}
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();
}
/*
 *	Flush all the connection entries in the ip_vs_conn_tab
 */
static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
{
	int idx;
	struct ip_vs_conn *cp, *cp_c;

flush_again:
	rcu_read_lock();
	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {

		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
			if (cp->ipvs != ipvs)
				continue;
			/* Force immediate expiry (see ip_vs_conn_expire_now);
			 * actual teardown happens in the timer handler.
			 */
			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_expire_now(cp);
			cp_c = cp->control;
			/* cp->control is valid only with reference to cp */
			if (cp_c && __ip_vs_conn_get(cp)) {
				IP_VS_DBG(4, "del conn template\n");
				ip_vs_conn_expire_now(cp_c);
				__ip_vs_conn_put(cp);
			}
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();

	/* the counter may be not NULL, because maybe some conn entries
	   are run by slow timer handler or unhashed but still referred */
	if (atomic_read(&ipvs->conn_count) != 0) {
		/* Yield and rescan until every entry of this netns is gone */
		schedule();
		goto flush_again;
	}
}
  1142. /*
  1143. * per netns init and exit
  1144. */
  1145. int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
  1146. {
  1147. atomic_set(&ipvs->conn_count, 0);
  1148. proc_create("ip_vs_conn", 0, ipvs->net->proc_net, &ip_vs_conn_fops);
  1149. proc_create("ip_vs_conn_sync", 0, ipvs->net->proc_net,
  1150. &ip_vs_conn_sync_fops);
  1151. return 0;
  1152. }
/* Per-netns teardown: expire and flush every connection of this netns,
 * then remove its /proc entries.
 */
void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
{
	/* flush all the connection entries first */
	ip_vs_conn_flush(ipvs);
	remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
}
  1160. int __init ip_vs_conn_init(void)
  1161. {
  1162. int idx;
  1163. /* Compute size and mask */
  1164. ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
  1165. ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
  1166. /*
  1167. * Allocate the connection hash table and initialize its list heads
  1168. */
  1169. ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
  1170. if (!ip_vs_conn_tab)
  1171. return -ENOMEM;
  1172. /* Allocate ip_vs_conn slab cache */
  1173. ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
  1174. sizeof(struct ip_vs_conn), 0,
  1175. SLAB_HWCACHE_ALIGN, NULL);
  1176. if (!ip_vs_conn_cachep) {
  1177. vfree(ip_vs_conn_tab);
  1178. return -ENOMEM;
  1179. }
  1180. pr_info("Connection hash table configured "
  1181. "(size=%d, memory=%ldKbytes)\n",
  1182. ip_vs_conn_tab_size,
  1183. (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
  1184. IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
  1185. sizeof(struct ip_vs_conn));
  1186. for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
  1187. INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
  1188. for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
  1189. spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
  1190. }
  1191. /* calculate the random value for connection hash */
  1192. get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
  1193. return 0;
  1194. }
/* Module-wide teardown: wait for all pending RCU frees of connection
 * entries, then release the slab cache and the hash table.
 */
void ip_vs_conn_cleanup(void)
{
	/* Wait all ip_vs_conn_rcu_free() callbacks to complete */
	rcu_barrier();
	/* Release the empty cache */
	kmem_cache_destroy(ip_vs_conn_cachep);
	vfree(ip_vs_conn_tab);
}