ip_vs_sh.c

/*
 * IPVS:        Source Hashing scheduling module
 *
 * Authors:     Wensong Zhang <wensong@gnuchina.org>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *
 */
/*
 * The sh algorithm selects a server by the hash key of the source IP
 * address. The pseudo code is as follows:
 *
 *       n <- servernode[src_ip];
 *       if (n is dead) OR
 *          (n is overloaded) or (n.weight <= 0) then
 *                 return NULL;
 *
 *       return n;
 *
 * Note that servernode is a 256-bucket hash table that maps the hash
 * index derived from the packet source IP address to the current server
 * array. If the sh scheduler is used in a cache cluster, it is good to
 * combine it with the cache_bypass feature. When the statically assigned
 * server is dead or overloaded, the load balancer can bypass the cache
 * server and send requests to the original server directly.
 *
 * The weight destination attribute can be used to control the
 * distribution of connections to the destinations in servernode. The
 * greater the weight, the more connections the destination
 * will receive.
 *
 */
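
/*
 * Usage sketch (illustrative, assuming the standard IPVS sysctl layout):
 * the cache_bypass feature mentioned above is an IPVS sysctl,
 * net.ipv4.vs.cache_bypass, typically enabled with
 *
 *      sysctl -w net.ipv4.vs.cache_bypass=1
 *
 * so that packets whose bucket has no available server are bypassed to
 * the original server rather than being dropped.
 */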
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/ip.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>

#include <net/ip_vs.h>

#include <net/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
/*
 *      IPVS SH bucket
 */
struct ip_vs_sh_bucket {
        struct ip_vs_dest __rcu *dest;  /* real server (cache) */
};

/*
 *      for IPVS SH entry hash table
 */
#ifndef CONFIG_IP_VS_SH_TAB_BITS
#define CONFIG_IP_VS_SH_TAB_BITS        8
#endif
#define IP_VS_SH_TAB_BITS               CONFIG_IP_VS_SH_TAB_BITS
#define IP_VS_SH_TAB_SIZE               (1 << IP_VS_SH_TAB_BITS)
#define IP_VS_SH_TAB_MASK               (IP_VS_SH_TAB_SIZE - 1)

struct ip_vs_sh_state {
        struct rcu_head                 rcu_head;
        struct ip_vs_sh_bucket          buckets[IP_VS_SH_TAB_SIZE];
};
/* Helper function to determine if server is unavailable */
static inline bool is_unavailable(struct ip_vs_dest *dest)
{
        return atomic_read(&dest->weight) <= 0 ||
               dest->flags & IP_VS_DEST_F_OVERLOAD;
}
/*
 *      Returns hash value for IPVS SH entry
 */
static inline unsigned int
ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
                 __be16 port, unsigned int offset)
{
        __be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6)
                addr_fold = addr->ip6[0]^addr->ip6[1]^
                            addr->ip6[2]^addr->ip6[3];
#endif
        return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
                IP_VS_SH_TAB_MASK;
}
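
/*
 * Worked example (illustrative): with the default 8-bit table
 * (IP_VS_SH_TAB_MASK == 0xff), offset == 0 and no port hashing
 * (port == 0), a source address of 192.168.1.7 folds to
 * ntohl(addr_fold) == 0xC0A80107.  The mask keeps only the low byte of
 * the product, so the bucket index is
 *
 *      (0xC0A80107 * 2654435761UL) & 0xff == (0x07 * 0xB1) & 0xff == 215
 */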
/*
 *      Get ip_vs_dest associated with supplied parameters.
 */
static inline struct ip_vs_dest *
ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
             const union nf_inet_addr *addr, __be16 port)
{
        unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
        struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);

        return (!dest || is_unavailable(dest)) ? NULL : dest;
}
/* As ip_vs_sh_get, but with fallback if selected server is unavailable
 *
 * The fallback strategy loops around the table starting from a "random"
 * point (in fact, it is chosen to be the original hash value to make the
 * algorithm deterministic) to find a new server.
 */
static inline struct ip_vs_dest *
ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
                      const union nf_inet_addr *addr, __be16 port)
{
        unsigned int offset, roffset;
        unsigned int hash, ihash;
        struct ip_vs_dest *dest;

        /* first try the dest it's supposed to go to */
        ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
        dest = rcu_dereference(s->buckets[ihash].dest);
        if (!dest)
                return NULL;
        if (!is_unavailable(dest))
                return dest;

        IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting",
                      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));

        /* if the original dest is unavailable, loop around the table
         * starting from ihash to find a new dest
         */
        for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
                roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE;
                hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset);
                dest = rcu_dereference(s->buckets[hash].dest);
                if (!dest)
                        break;
                if (!is_unavailable(dest))
                        return dest;
                IP_VS_DBG_BUF(6, "SH: selected unavailable "
                              "server %s:%d (offset %d), reselecting",
                              IP_VS_DBG_ADDR(dest->af, &dest->addr),
                              ntohs(dest->port), roffset);
        }

        return NULL;
}
/*
 *      Assign all the hash buckets of the specified table with the service.
 */
static int
ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
{
        int i;
        struct ip_vs_sh_bucket *b;
        struct list_head *p;
        struct ip_vs_dest *dest;
        int d_count;
        bool empty;

        b = &s->buckets[0];
        p = &svc->destinations;
        empty = list_empty(p);
        d_count = 0;
        for (i = 0; i < IP_VS_SH_TAB_SIZE; i++) {
                dest = rcu_dereference_protected(b->dest, 1);
                if (dest)
                        ip_vs_dest_put(dest);
                if (empty)
                        RCU_INIT_POINTER(b->dest, NULL);
                else {
                        if (p == &svc->destinations)
                                p = p->next;

                        dest = list_entry(p, struct ip_vs_dest, n_list);
                        ip_vs_dest_hold(dest);
                        RCU_INIT_POINTER(b->dest, dest);

                        IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
                                      i, IP_VS_DBG_ADDR(dest->af, &dest->addr),
                                      atomic_read(&dest->weight));

                        /* Don't move to next dest until filling weight */
                        if (++d_count >= atomic_read(&dest->weight)) {
                                p = p->next;
                                d_count = 0;
                        }
                }
                b++;
        }
        return 0;
}
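
/*
 * Example (illustrative): with two destinations A (weight 2) and
 * B (weight 1), the loop above fills the buckets with the repeating
 * pattern A, A, B, A, A, B, ..., so A owns roughly twice as many
 * source-hash buckets as B.
 */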
/*
 *      Flush all the hash buckets of the specified table.
 */
static void ip_vs_sh_flush(struct ip_vs_sh_state *s)
{
        int i;
        struct ip_vs_sh_bucket *b;
        struct ip_vs_dest *dest;

        b = &s->buckets[0];
        for (i = 0; i < IP_VS_SH_TAB_SIZE; i++) {
                dest = rcu_dereference_protected(b->dest, 1);
                if (dest) {
                        ip_vs_dest_put(dest);
                        RCU_INIT_POINTER(b->dest, NULL);
                }
                b++;
        }
}
static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
{
        struct ip_vs_sh_state *s;

        /* allocate the SH table for this service */
        s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL);
        if (s == NULL)
                return -ENOMEM;

        svc->sched_data = s;
        IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
                  "current service\n",
                  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);

        /* assign the hash buckets with current dests */
        ip_vs_sh_reassign(s, svc);

        return 0;
}
static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
{
        struct ip_vs_sh_state *s = svc->sched_data;

        /* got to clean up hash buckets here */
        ip_vs_sh_flush(s);

        /* release the table itself */
        kfree_rcu(s, rcu_head);
        IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
                  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
}
static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
                                 struct ip_vs_dest *dest)
{
        struct ip_vs_sh_state *s = svc->sched_data;

        /* assign the hash buckets with the updated service */
        ip_vs_sh_reassign(s, svc);

        return 0;
}
/* Helper function to get port number */
static inline __be16
ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
{
        __be16 _ports[2], *ports;

        /* At this point we know that we have a valid packet of some kind.
         * Because ICMP packets are only guaranteed to have the first 8
         * bytes, let's just grab the ports.  Fortunately they're in the
         * same position for all three of the protocols we care about.
         */
        switch (iph->protocol) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_SCTP:
                ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
                                           &_ports);
                if (unlikely(!ports))
                        return 0;

                if (likely(!ip_vs_iph_inverse(iph)))
                        return ports[0];
                else
                        return ports[1];
        default:
                return 0;
        }
}
/*
 *      Source Hashing scheduling
 */
static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
                  struct ip_vs_iphdr *iph)
{
        struct ip_vs_dest *dest;
        struct ip_vs_sh_state *s;
        __be16 port = 0;
        const union nf_inet_addr *hash_addr;

        hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;

        IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");

        if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
                port = ip_vs_sh_get_port(skb, iph);

        s = (struct ip_vs_sh_state *) svc->sched_data;

        if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
                dest = ip_vs_sh_get_fallback(svc, s, hash_addr, port);
        else
                dest = ip_vs_sh_get(svc, s, hash_addr, port);

        if (!dest) {
                ip_vs_scheduler_err(svc, "no destination available");
                return NULL;
        }

        IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
                      IP_VS_DBG_ADDR(svc->af, hash_addr),
                      IP_VS_DBG_ADDR(dest->af, &dest->addr),
                      ntohs(dest->port));

        return dest;
}
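
/*
 * Usage sketch (illustrative; assumes an ipvsadm build that supports
 * scheduler flags via -b/--sched-flags): the SH_PORT and SH_FALLBACK
 * behaviours tested above are selected per service from user space, e.g.
 *
 *      ipvsadm -A -t 10.0.0.1:80 -s sh -b sh-fallback,sh-port
 *
 * sh-port folds the source port into the hash key; sh-fallback makes an
 * unavailable bucket fall through to ip_vs_sh_get_fallback() instead of
 * failing the lookup.
 */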
/*
 *      IPVS SH Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_sh_scheduler =
{
        .name =                 "sh",
        .refcnt =               ATOMIC_INIT(0),
        .module =               THIS_MODULE,
        .n_list =               LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
        .init_service =         ip_vs_sh_init_svc,
        .done_service =         ip_vs_sh_done_svc,
        .add_dest =             ip_vs_sh_dest_changed,
        .del_dest =             ip_vs_sh_dest_changed,
        .upd_dest =             ip_vs_sh_dest_changed,
        .schedule =             ip_vs_sh_schedule,
};
static int __init ip_vs_sh_init(void)
{
        return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
}

static void __exit ip_vs_sh_cleanup(void)
{
        unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
        synchronize_rcu();
}

module_init(ip_vs_sh_init);
module_exit(ip_vs_sh_cleanup);
MODULE_LICENSE("GPL");