/*
 * IPVS:	Locality-Based Least-Connection with Replication scheduler
 *
 * Authors:	Wensong Zhang <wensong@gnuchina.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Changes:
 *	Julian Anastasov	:	Added the missing (dest->weight>0)
 *					condition in the ip_vs_dest_set_max.
 *
 */

/*
 * The lblc/r algorithm is as follows (pseudo code):
 *
 *	if serverSet[dest_ip] is null then
 *		n, serverSet[dest_ip] <- {weighted least-conn node};
 *	else
 *		n <- {least-conn (alive) node in serverSet[dest_ip]};
 *		if (n is null) OR
 *		   (n.conns>n.weight AND
 *		    there is a node m with m.conns<m.weight/2) then
 *			n <- {weighted least-conn node};
 *			add n to serverSet[dest_ip];
 *		if |serverSet[dest_ip]| > 1 AND
 *		    now - serverSet[dest_ip].lastMod > T then
 *			m <- {most conn node in serverSet[dest_ip]};
 *			remove m from serverSet[dest_ip];
 *	if serverSet[dest_ip] changed then
 *		serverSet[dest_ip].lastMod <- now;
 *
 *	return n;
 *
 */
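/*
 * Editor's walk-through (illustrative, not from the original source):
 * suppose dest_ip D maps to serverSet[D] = {A} with A.weight = 2.
 * Once A.conns climbs to 3 (> A.weight) and some server m has
 * m.conns < m.weight/2, the weighted least-connection server m is
 * added to serverSet[D], replicating the hot destination across more
 * servers.  Later, once the set holds more than one member and has
 * been unmodified for longer than T, the busiest member is trimmed
 * away again, so the set shrinks back when the load spike passes.
 */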
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/slab.h>

/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <net/net_namespace.h>

#include <net/ip_vs.h>

/*
 * It is for garbage collection of stale IPVS lblcr entries,
 * when the table is full.
 */
#define CHECK_EXPIRE_INTERVAL	(60*HZ)
#define ENTRY_TIMEOUT		(6*60*HZ)
#define DEFAULT_EXPIRATION	(24*60*60*HZ)

/*
 * It is for full expiration check.
 * When there is no partial expiration check (garbage collection)
 * in a half hour, do a full expiration check to collect stale
 * entries that haven't been touched for a day.
 */
#define COUNT_FOR_FULL_EXPIRATION	30
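/*
 * 30 partial checks * CHECK_EXPIRE_INTERVAL (60*HZ) = 30 minutes,
 * which is the "half hour" mentioned above.
 */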
/*
 * for IPVS lblcr entry hash table
 */
#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
#define CONFIG_IP_VS_LBLCR_TAB_BITS	10
#endif
#define IP_VS_LBLCR_TAB_BITS	CONFIG_IP_VS_LBLCR_TAB_BITS
#define IP_VS_LBLCR_TAB_SIZE	(1 << IP_VS_LBLCR_TAB_BITS)
#define IP_VS_LBLCR_TAB_MASK	(IP_VS_LBLCR_TAB_SIZE - 1)
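/*
 * With the default of 10 bits this is a 1024-bucket table
 * (IP_VS_LBLCR_TAB_SIZE = 1 << 10 = 1024) with mask 0x3ff, so a hash
 * value is reduced to a bucket index with a single AND.
 */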
/*
 *	IPVS destination set structure and operations
 */
struct ip_vs_dest_set_elem {
	struct list_head	list;	/* list link */
	struct ip_vs_dest	*dest;	/* destination server */
	struct rcu_head		rcu_head;
};

struct ip_vs_dest_set {
	atomic_t		size;		/* set size */
	unsigned long		lastmod;	/* last modified time */
	struct list_head	list;		/* destination list */
};

static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
				  struct ip_vs_dest *dest, bool check)
{
	struct ip_vs_dest_set_elem *e;

	if (check) {
		list_for_each_entry(e, &set->list, list) {
			if (e->dest == dest)
				return;
		}
	}

	e = kmalloc(sizeof(*e), GFP_ATOMIC);
	if (e == NULL)
		return;

	ip_vs_dest_hold(dest);
	e->dest = dest;

	list_add_rcu(&e->list, &set->list);
	atomic_inc(&set->size);

	set->lastmod = jiffies;
}

static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest_set_elem *e;

	e = container_of(head, struct ip_vs_dest_set_elem, rcu_head);
	ip_vs_dest_put_and_free(e->dest);
	kfree(e);
}

static void
ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
	struct ip_vs_dest_set_elem *e;

	list_for_each_entry(e, &set->list, list) {
		if (e->dest == dest) {
			/* HIT */
			atomic_dec(&set->size);
			set->lastmod = jiffies;
			list_del_rcu(&e->list);
			call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
			break;
		}
	}
}

static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
{
	struct ip_vs_dest_set_elem *e, *ep;

	list_for_each_entry_safe(e, ep, &set->list, list) {
		list_del_rcu(&e->list);
		call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
	}
}

/* get weighted least-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
{
	register struct ip_vs_dest_set_elem *e;
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/* select the first destination server whose weight > 0 */
	list_for_each_entry_rcu(e, &set->list, list) {
		least = e->dest;
		if (least->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		if ((atomic_read(&least->weight) > 0)
		    && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
			loh = ip_vs_dest_conn_overhead(least);
			goto nextstage;
		}
	}
	return NULL;

	/* find the destination with the weighted least load */
  nextstage:
	list_for_each_entry_continue_rcu(e, &set->list, list) {
		dest = e->dest;
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = ip_vs_dest_conn_overhead(dest);
		if (((__s64)loh * atomic_read(&dest->weight) >
		     (__s64)doh * atomic_read(&least->weight))
		    && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "%s(): server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      __func__,
		      IP_VS_DBG_ADDR(least->af, &least->addr),
		      ntohs(least->port),
		      atomic_read(&least->activeconns),
		      atomic_read(&least->refcnt),
		      atomic_read(&least->weight), loh);
	return least;
}

/* get weighted most-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
{
	register struct ip_vs_dest_set_elem *e;
	struct ip_vs_dest *dest, *most;
	int moh, doh;

	if (set == NULL)
		return NULL;

	/* select the first destination server whose weight > 0 */
	list_for_each_entry(e, &set->list, list) {
		most = e->dest;
		if (atomic_read(&most->weight) > 0) {
			moh = ip_vs_dest_conn_overhead(most);
			goto nextstage;
		}
	}
	return NULL;

	/* find the destination with the weighted most load */
  nextstage:
	list_for_each_entry_continue(e, &set->list, list) {
		dest = e->dest;
		doh = ip_vs_dest_conn_overhead(dest);
		/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
		if (((__s64)moh * atomic_read(&dest->weight) <
		     (__s64)doh * atomic_read(&most->weight))
		    && (atomic_read(&dest->weight) > 0)) {
			most = dest;
			moh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "%s(): server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      __func__,
		      IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port),
		      atomic_read(&most->activeconns),
		      atomic_read(&most->refcnt),
		      atomic_read(&most->weight), moh);
	return most;
}

/*
 *	IPVS lblcr entry represents an association between destination
 *	IP address and its destination server set
 */
struct ip_vs_lblcr_entry {
	struct hlist_node	list;
	int			af;		/* address family */
	union nf_inet_addr	addr;		/* destination IP address */
	struct ip_vs_dest_set	set;		/* destination server set */
	unsigned long		lastuse;	/* last used time */
	struct rcu_head		rcu_head;
};

/*
 *	IPVS lblcr hash table
 */
struct ip_vs_lblcr_table {
	struct rcu_head		rcu_head;
	struct hlist_head	bucket[IP_VS_LBLCR_TAB_SIZE];	/* hash bucket */
	atomic_t		entries;	/* number of entries */
	int			max_size;	/* maximum size of entries */
	struct timer_list	periodic_timer;	/* collect stale entries */
	int			rover;		/* rover for expire check */
	int			counter;	/* counter for no expire */
	bool			dead;
};

#ifdef CONFIG_SYSCTL
/*
 *	IPVS LBLCR sysctl table
 */
static struct ctl_table vs_vars_table[] = {
	{
		.procname	= "lblcr_expiration",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{ }
};
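/*
 * The table is registered under "net/ipv4/vs" (see __ip_vs_lblcr_init
 * below), so the knob appears as net.ipv4.vs.lblcr_expiration.
 * proc_dointvec_jiffies converts between seconds in userspace and
 * jiffies in the kernel; e.g. (editor's example) restoring the
 * 24-hour default:
 *
 *	sysctl -w net.ipv4.vs.lblcr_expiration=86400
 */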
#endif

static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
	hlist_del_rcu(&en->list);
	ip_vs_dest_set_eraseall(&en->set);
	kfree_rcu(en, rcu_head);
}

/*
 *	Returns hash value for IPVS LBLCR entry
 */
static inline unsigned int
ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
{
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif

	return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
}
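/*
 * 2654435761 is a prime close to 2^32/phi, the multiplier used in
 * Knuth-style multiplicative hashing; it scatters nearby addresses
 * across the buckets.  For example (editor's sketch), 192.168.1.1
 * folds to ntohl(0xc0a80101), is multiplied, and only the low
 * IP_VS_LBLCR_TAB_BITS bits survive the mask as the bucket index.
 */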
/*
 *	Hash an entry in the ip_vs_lblcr_table.
 */
static void
ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
{
	unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr);

	hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
}

/* Get ip_vs_lblcr_entry associated with supplied parameters. */
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
		const union nf_inet_addr *addr)
{
	unsigned int hash = ip_vs_lblcr_hashkey(af, addr);
	struct ip_vs_lblcr_entry *en;

	hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
		if (ip_vs_addr_equal(af, &en->addr, addr))
			return en;
	return NULL;
}

/*
 * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
 * IP address to a server. Called under spin lock.
 */
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
		u16 af, struct ip_vs_dest *dest)
{
	struct ip_vs_lblcr_entry *en;

	en = ip_vs_lblcr_get(af, tbl, daddr);
	if (!en) {
		en = kmalloc(sizeof(*en), GFP_ATOMIC);
		if (!en)
			return NULL;

		en->af = af;
		ip_vs_addr_copy(af, &en->addr, daddr);
		en->lastuse = jiffies;

		/* initialize its dest set */
		atomic_set(&(en->set.size), 0);
		INIT_LIST_HEAD(&en->set.list);

		ip_vs_dest_set_insert(&en->set, dest, false);

		ip_vs_lblcr_hash(tbl, en);
		return en;
	}

	ip_vs_dest_set_insert(&en->set, dest, true);

	return en;
}

/*
 *	Flush all the entries of the specified table.
 */
static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
{
	struct ip_vs_lblcr_table *tbl = svc->sched_data;
	int i;
	struct ip_vs_lblcr_entry *en;
	struct hlist_node *next;

	spin_lock_bh(&svc->sched_lock);
	tbl->dead = 1;
	for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
			ip_vs_lblcr_free(en);
		}
	}
	spin_unlock_bh(&svc->sched_lock);
}

static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
{
#ifdef CONFIG_SYSCTL
	return svc->ipvs->sysctl_lblcr_expiration;
#else
	return DEFAULT_EXPIRATION;
#endif
}

static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
{
	struct ip_vs_lblcr_table *tbl = svc->sched_data;
	unsigned long now = jiffies;
	int i, j;
	struct ip_vs_lblcr_entry *en;
	struct hlist_node *next;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;

		spin_lock(&svc->sched_lock);
		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
			if (time_after(en->lastuse +
				       sysctl_lblcr_expiration(svc), now))
				continue;

			ip_vs_lblcr_free(en);
			atomic_dec(&tbl->entries);
		}
		spin_unlock(&svc->sched_lock);
	}
	tbl->rover = j;
}

/*
 *	Periodic timer handler for IPVS lblcr table.
 *	It is used to collect stale entries when the number of entries
 *	exceeds the maximum size of the table.
 *
 *	Fixme: we probably need a more complicated algorithm to collect
 *	       entries that have not been used for a long time even
 *	       if the number of entries doesn't exceed the maximum size
 *	       of the table.
 *	The full expiration check is for this purpose now.
 */
static void ip_vs_lblcr_check_expire(unsigned long data)
{
	struct ip_vs_service *svc = (struct ip_vs_service *) data;
	struct ip_vs_lblcr_table *tbl = svc->sched_data;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblcr_entry *en;
	struct hlist_node *next;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblcr_full_check(svc);
		tbl->counter = 1;
		goto out;
	}

	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;
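	/*
	 * Editor's illustration: with the default table, max_size is
	 * 1024*16 = 16384.  At 17000 entries the goal is
	 * (17000 - 16384)*4/3 = 821 evictions; the max_size/2 = 8192
	 * cap only matters after a massive overshoot.
	 */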
	for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;

		spin_lock(&svc->sched_lock);
		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
				continue;

			ip_vs_lblcr_free(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		spin_unlock(&svc->sched_lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

  out:
	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}

static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblcr_table *tbl;

	/*
	 *    Allocate the ip_vs_lblcr_table for this service
	 */
	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
	if (tbl == NULL)
		return -ENOMEM;

	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
		  "current service\n", sizeof(*tbl));

	/*
	 *    Initialize the hash buckets
	 */
	for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
		INIT_HLIST_HEAD(&tbl->bucket[i]);
	}
	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;
	tbl->dead = 0;

	/*
	 *    Hook periodic timer for garbage collection
	 */
	setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
		    (unsigned long)svc);
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

	return 0;
}

static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblcr_table *tbl = svc->sched_data;

	/* remove periodic timer */
	del_timer_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblcr_flush(svc);

	/* release the table itself */
	kfree_rcu(tbl, rcu_head);
	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
		  sizeof(*tbl));
}

static inline struct ip_vs_dest *
__ip_vs_lblcr_schedule(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We use the following formula to estimate the load:
	 *		  (dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *		  h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
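	/*
	 * Editor's worked example: loh=10 with least->weight=2 against
	 * doh=12 with dest->weight=3.  10/2 = 5.0 > 12/3 = 4.0, and the
	 * integer test agrees: 10*3 = 30 > 12*2 = 24, so dest becomes
	 * the new least.  The __s64 casts keep the products from
	 * overflowing when overheads and weights are large.
	 */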
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = ip_vs_dest_conn_overhead(least);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 *    Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = ip_vs_dest_conn_overhead(dest);
		if ((__s64)loh * atomic_read(&dest->weight) >
		    (__s64)doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "LBLCR: server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      IP_VS_DBG_ADDR(least->af, &least->addr),
		      ntohs(least->port),
		      atomic_read(&least->activeconns),
		      atomic_read(&least->refcnt),
		      atomic_read(&least->weight), loh);

	return least;
}

/*
 *   If this destination server is overloaded and there is a less loaded
 *   server, then return true.
 */
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
		struct ip_vs_dest *d;

		list_for_each_entry_rcu(d, &svc->destinations, n_list) {
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
				return 1;
			}
		}
	}
	return 0;
}
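/*
 * Editor's illustration: a dest with activeconns=5 and weight=4 trips
 * the first test; if any peer then runs at activeconns*2 < weight
 * (say 1 active connection against weight 3), is_overloaded() returns
 * 1 and the caller reschedules instead of reusing the cached server.
 */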
/*
 *    Locality-Based (weighted) Least-Connection with Replication scheduling
 */
static struct ip_vs_dest *
ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
		     struct ip_vs_iphdr *iph)
{
	struct ip_vs_lblcr_table *tbl = svc->sched_data;
	struct ip_vs_dest *dest;
	struct ip_vs_lblcr_entry *en;

	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);

	/* First look in our cache */
	en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr);
	if (en) {
		en->lastuse = jiffies;

		/* Get the least loaded destination */
		dest = ip_vs_dest_set_min(&en->set);

		/* More than one destination + enough time passed by, cleanup */
		if (atomic_read(&en->set.size) > 1 &&
		    time_after(jiffies, en->set.lastmod +
				sysctl_lblcr_expiration(svc))) {
			spin_lock_bh(&svc->sched_lock);
			if (atomic_read(&en->set.size) > 1) {
				struct ip_vs_dest *m;

				m = ip_vs_dest_set_max(&en->set);
				if (m)
					ip_vs_dest_set_erase(&en->set, m);
			}
			spin_unlock_bh(&svc->sched_lock);
		}

		/* If the destination is not overloaded, use it */
		if (dest && !is_overloaded(dest, svc))
			goto out;

		/* The cache entry is invalid, time to schedule */
		dest = __ip_vs_lblcr_schedule(svc);
		if (!dest) {
			ip_vs_scheduler_err(svc, "no destination available");
			return NULL;
		}

		/* Update our cache entry */
		spin_lock_bh(&svc->sched_lock);
		if (!tbl->dead)
			ip_vs_dest_set_insert(&en->set, dest, true);
		spin_unlock_bh(&svc->sched_lock);
		goto out;
	}

	/* No cache entry, time to schedule */
	dest = __ip_vs_lblcr_schedule(svc);
	if (!dest) {
		IP_VS_DBG(1, "no destination available\n");
		return NULL;
	}

	/* If we fail to create a cache entry, we'll just use the valid dest */
	spin_lock_bh(&svc->sched_lock);
	if (!tbl->dead)
		ip_vs_lblcr_new(tbl, &iph->daddr, svc->af, dest);
	spin_unlock_bh(&svc->sched_lock);

out:
	IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
		      IP_VS_DBG_ADDR(svc->af, &iph->daddr),
		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));

	return dest;
}

/*
 *	IPVS LBLCR Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
{
	.name =			"lblcr",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
	.init_service =		ip_vs_lblcr_init_svc,
	.done_service =		ip_vs_lblcr_done_svc,
	.schedule =		ip_vs_lblcr_schedule,
};

/*
 *  per netns init.
 */
#ifdef CONFIG_SYSCTL
static int __net_init __ip_vs_lblcr_init(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	if (!ipvs)
		return -ENOENT;

	if (!net_eq(net, &init_net)) {
		ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
						sizeof(vs_vars_table),
						GFP_KERNEL);
		if (ipvs->lblcr_ctl_table == NULL)
			return -ENOMEM;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			ipvs->lblcr_ctl_table[0].procname = NULL;
	} else
		ipvs->lblcr_ctl_table = vs_vars_table;
	ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
	ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;

	ipvs->lblcr_ctl_header =
		register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table);
	if (!ipvs->lblcr_ctl_header) {
		if (!net_eq(net, &init_net))
			kfree(ipvs->lblcr_ctl_table);
		return -ENOMEM;
	}

	return 0;
}

static void __net_exit __ip_vs_lblcr_exit(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	unregister_net_sysctl_table(ipvs->lblcr_ctl_header);

	if (!net_eq(net, &init_net))
		kfree(ipvs->lblcr_ctl_table);
}

#else

static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; }
static void __net_exit __ip_vs_lblcr_exit(struct net *net) { }

#endif

static struct pernet_operations ip_vs_lblcr_ops = {
	.init = __ip_vs_lblcr_init,
	.exit = __ip_vs_lblcr_exit,
};

static int __init ip_vs_lblcr_init(void)
{
	int ret;

	ret = register_pernet_subsys(&ip_vs_lblcr_ops);
	if (ret)
		return ret;

	ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
	if (ret)
		unregister_pernet_subsys(&ip_vs_lblcr_ops);
	return ret;
}

static void __exit ip_vs_lblcr_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
	unregister_pernet_subsys(&ip_vs_lblcr_ops);
	rcu_barrier();
}

module_init(ip_vs_lblcr_init);
module_exit(ip_vs_lblcr_cleanup);
MODULE_LICENSE("GPL");