ip_vs_sync.c (54 KB)
  1. /*
  2. * IPVS An implementation of the IP virtual server support for the
  3. * LINUX operating system. IPVS is now implemented as a module
  4. * over the NetFilter framework. IPVS can be used to build a
  5. * high-performance and highly available server based on a
  6. * cluster of servers.
  7. *
  8. * Version 1, is capable of handling both version 0 and 1 messages.
  9. * Version 0 is the plain old format.
  10. * Note Version 0 receivers will just drop Ver 1 messages.
  11. * Version 1 is capable of handle IPv6, Persistence data,
  12. * time-outs, and firewall marks.
  13. * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
  14. * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
  15. *
  16. * Definitions Message: is a complete datagram
  17. * Sync_conn: is a part of a Message
  18. * Param Data is an option to a Sync_conn.
  19. *
  20. * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
  21. *
  22. * ip_vs_sync: sync connection info from master load balancer to backups
  23. * through multicast
  24. *
  25. * Changes:
  26. * Alexandre Cassen : Added master & backup support at a time.
  27. * Alexandre Cassen : Added SyncID support for incoming sync
  28. * messages filtering.
  29. * Justin Ossevoort : Fix endian problem on sync message size.
  30. * Hans Schillstrom : Added Version 1: i.e. IPv6,
  31. * Persistence support, fwmark and time-out.
  32. */
  33. #define KMSG_COMPONENT "IPVS"
  34. #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  35. #include <linux/module.h>
  36. #include <linux/slab.h>
  37. #include <linux/inetdevice.h>
  38. #include <linux/net.h>
  39. #include <linux/completion.h>
  40. #include <linux/delay.h>
  41. #include <linux/skbuff.h>
  42. #include <linux/in.h>
  43. #include <linux/igmp.h> /* for ip_mc_join_group */
  44. #include <linux/udp.h>
  45. #include <linux/err.h>
  46. #include <linux/kthread.h>
  47. #include <linux/wait.h>
  48. #include <linux/kernel.h>
  49. #include <linux/sched.h>
  50. #include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
  51. #include <net/ip.h>
  52. #include <net/sock.h>
  53. #include <net/ip_vs.h>
#define IP_VS_SYNC_GROUP 0xe0000051		/* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT  8848			/* multicast port */
#define SYNC_PROTO_VER   1			/* Protocol version in header */

/* lockdep class key -- NOTE(review): presumably used to annotate the sync
 * socket locks created later in this file; confirm against the socket
 * setup code (not visible in this chunk).
 */
static struct lock_class_key __ipvs_sync_key;
  58. /*
  59. * IPVS sync connection entry
  60. * Version 0, i.e. original version.
  61. */
/* On-the-wire entry; all multi-byte fields are network order (__be16/__be32). */
struct ip_vs_sync_conn_v0 {
	__u8			reserved;

	/* Protocol, addresses and port numbers */
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			cport;		/* client port */
	__be16			vport;		/* virtual service port */
	__be16			dport;		/* destination (real server) port */
	__be32			caddr;		/* client address */
	__be32			vaddr;		/* virtual address */
	__be32			daddr;		/* destination address */

	/* Flags and state transition */
	__be16			flags;		/* status flags */
	__be16			state;		/* state info */

	/* The sequence options start here */
};

/* Sequence-number state appended after the entry above when the
 * connection has IP_VS_CONN_F_SEQ_MASK set (see FULL_CONN_SIZE).
 */
struct ip_vs_sync_conn_options {
	struct ip_vs_seq	in_seq;		/* incoming seq. struct */
	struct ip_vs_seq	out_seq;	/* outgoing seq. struct */
};
  81. /*
  82. Sync Connection format (sync_conn)
  83. 0 1 2 3
  84. 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  85. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  86. | Type | Protocol | Ver. | Size |
  87. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  88. | Flags |
  89. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  90. | State | cport |
  91. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  92. | vport | dport |
  93. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  94. | fwmark |
  95. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  96. | timeout (in sec.) |
  97. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  98. | ... |
  99. | IP-Addresses (v4 or v6) |
  100. | ... |
  101. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  102. Optional Parameters.
  103. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  104. | Param. Type | Param. Length | Param. data |
  105. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
  106. | ... |
  107. | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  108. | | Param Type | Param. Length |
  109. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  110. | Param data |
  111. | Last Param data should be padded for 32 bit alignment |
  112. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  113. */
  114. /*
  115. * Type 0, IPv4 sync connection format
  116. */
struct ip_vs_sync_v4 {
	__u8			type;		/* STYPE_F_INET6 bit clear for v4 */
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits, size in
						 * the low 12 (SVER_MASK) */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info */
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	__be32			caddr;		/* client address */
	__be32			vaddr;		/* virtual address */
	__be32			daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};
  136. /*
  137. * Type 2 messages IPv6
  138. */
struct ip_vs_sync_v6 {
	__u8			type;		/* STYPE_F_INET6 bit set for v6 */
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits, size in
						 * the low 12 (SVER_MASK) */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info */
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	struct in6_addr		caddr;		/* client address */
	struct in6_addr		vaddr;		/* virtual address */
	struct in6_addr		daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};

/* One sync_conn entry on the wire; v4 vs v6 is selected by the type byte. */
union ip_vs_sync_conn {
	struct ip_vs_sync_v4	v4;
	struct ip_vs_sync_v6	v6;
};
/* Bits in Type field in above */
#define STYPE_INET6		0
#define STYPE_F_INET6		(1 << STYPE_INET6)

#define SVER_SHIFT		12	/* Shift to get version */
#define SVER_MASK		0x0fff	/* Mask to strip version */

/* Option types for the optional parameters following a sync_conn */
#define IPVS_OPT_SEQ_DATA	1
#define IPVS_OPT_PE_DATA	2
#define IPVS_OPT_PE_NAME	3
#define IPVS_OPT_PARAM		7

/* Presence flags: option N present -> bit (N-1) set */
#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA-1))
#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA-1))
#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME-1))
#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))

/* Per-thread state handed to the sync kthreads. */
struct ip_vs_sync_thread_data {
	struct netns_ipvs *ipvs;
	struct socket *sock;
	char *buf;	/* message buffer -- NOTE(review): usage is in the
			 * thread functions below this chunk; confirm there */
	int id;		/* thread id -- presumably the index into ipvs->ms[] */
};

/* Version 0 definition of packet sizes */
#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE  \
(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
  185. /*
  186. The master mulitcasts messages (Datagrams) to the backup load balancers
  187. in the following format.
  188. Version 1:
  189. Note, first byte should be Zero, so ver 0 receivers will drop the packet.
  190. 0 1 2 3
  191. 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  192. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  193. | 0 | SyncID | Size |
  194. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  195. | Count Conns | Version | Reserved, set to Zero |
  196. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  197. | |
  198. | IPVS Sync Connection (1) |
  199. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  200. | . |
  201. ~ . ~
  202. | . |
  203. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  204. | |
  205. | IPVS Sync Connection (n) |
  206. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  207. Version 0 Header
  208. 0 1 2 3
  209. 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  210. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  211. | Count Conns | SyncID | Size |
  212. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  213. | IPVS Sync Connection (1) |
  214. */
#define SYNC_MESG_HEADER_LEN	4
#define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */

/* Version 0 header */
struct ip_vs_sync_mesg_v0 {
	__u8			nr_conns;
	__u8			syncid;
	__be16			size;

	/* ip_vs_sync_conn entries start here */
};

/* Version 1 header.
 * The first byte (reserved) must be zero: a Version 0 receiver reads it
 * as nr_conns and therefore drops the datagram (see the format comment
 * above).
 */
struct ip_vs_sync_mesg {
	__u8			reserved;	/* must be zero */
	__u8			syncid;
	__be16			size;
	__u8			nr_conns;
	__s8			version;	/* SYNC_PROTO_VER */
	__u16			spare;
	/* ip_vs_sync_conn entries start here */
};

/* IPv4 or IPv6 endpoint address for a sync socket. */
union ipvs_sockaddr {
	struct sockaddr_in	in;
	struct sockaddr_in6	in6;
};

/* A sync datagram under construction, or queued for sending. */
struct ip_vs_sync_buff {
	struct list_head	list;		/* link in ms->sync_queue */
	unsigned long		firstuse;	/* jiffies when created */

	/* pointers for the message data */
	struct ip_vs_sync_mesg	*mesg;		/* header + payload */
	unsigned char		*head;		/* current write position */
	unsigned char		*end;		/* one past end of buffer */
};
  246. /*
  247. * Copy of struct ip_vs_seq
  248. * From unaligned network order to aligned host order
  249. */
  250. static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
  251. {
  252. ho->init_seq = get_unaligned_be32(&no->init_seq);
  253. ho->delta = get_unaligned_be32(&no->delta);
  254. ho->previous_delta = get_unaligned_be32(&no->previous_delta);
  255. }
  256. /*
  257. * Copy of struct ip_vs_seq
  258. * From Aligned host order to unaligned network order
  259. */
  260. static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
  261. {
  262. put_unaligned_be32(ho->init_seq, &no->init_seq);
  263. put_unaligned_be32(ho->delta, &no->delta);
  264. put_unaligned_be32(ho->previous_delta, &no->previous_delta);
  265. }
/* Pop the oldest pending sync buffer from a master sync thread's queue.
 *
 * Returns the buffer (caller takes ownership) or NULL when the queue is
 * empty.  On an empty queue the current task is flipped to
 * TASK_INTERRUPTIBLE *under* sync_lock, so the master thread can go to
 * sleep afterwards without losing a wakeup that races with this check.
 */
static inline struct ip_vs_sync_buff *
sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
{
	struct ip_vs_sync_buff *sb;

	spin_lock_bh(&ipvs->sync_lock);
	if (list_empty(&ms->sync_queue)) {
		sb = NULL;
		__set_current_state(TASK_INTERRUPTIBLE);
	} else {
		sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
				list);
		list_del(&sb->list);
		ms->sync_queue_len--;
		/* queue drained: reset the delayed-wakeup pacing counter */
		if (!ms->sync_queue_len)
			ms->sync_queue_delay = 0;
	}
	spin_unlock_bh(&ipvs->sync_lock);
	return sb;
}
  285. /*
  286. * Create a new sync buffer for Version 1 proto.
  287. */
  288. static inline struct ip_vs_sync_buff *
  289. ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len)
  290. {
  291. struct ip_vs_sync_buff *sb;
  292. if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
  293. return NULL;
  294. len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg),
  295. ipvs->mcfg.sync_maxlen);
  296. sb->mesg = kmalloc(len, GFP_ATOMIC);
  297. if (!sb->mesg) {
  298. kfree(sb);
  299. return NULL;
  300. }
  301. sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
  302. sb->mesg->version = SYNC_PROTO_VER;
  303. sb->mesg->syncid = ipvs->mcfg.syncid;
  304. sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
  305. sb->mesg->nr_conns = 0;
  306. sb->mesg->spare = 0;
  307. sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
  308. sb->end = (unsigned char *)sb->mesg + len;
  309. sb->firstuse = jiffies;
  310. return sb;
  311. }
/* Free a sync buffer together with its embedded message. */
static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
{
	kfree(sb->mesg);
	kfree(sb);
}
/* Move ms->sync_buff to the tail of the master thread's send queue.
 *
 * Called with sync_buff_lock held and BH already disabled (see the
 * callers in ip_vs_sync_conn/ip_vs_sync_conn_v0), hence the plain
 * spin_lock() on sync_lock here.  If syncing was stopped or the queue is
 * over sysctl_sync_qlen_max, the buffer is dropped instead of queued.
 */
static inline void sb_queue_tail(struct netns_ipvs *ipvs,
				 struct ipvs_master_sync_state *ms)
{
	struct ip_vs_sync_buff *sb = ms->sync_buff;

	spin_lock(&ipvs->sync_lock);
	if (ipvs->sync_state & IP_VS_STATE_MASTER &&
	    ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
		/* first buffer queued: arm the delayed wakeup in case no
		 * further traffic pushes the queue past the wakeup rate
		 */
		if (!ms->sync_queue_len)
			schedule_delayed_work(&ms->master_wakeup_work,
					      max(IPVS_SYNC_SEND_DELAY, 1));
		ms->sync_queue_len++;
		list_add_tail(&sb->list, &ms->sync_queue);
		/* enough buffers accumulated: wake the sender immediately */
		if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
			wake_up_process(ms->master_thread);
	} else
		ip_vs_sync_buff_release(sb);
	spin_unlock(&ipvs->sync_lock);
}
  335. /*
  336. * Get the current sync buffer if it has been created for more
  337. * than the specified time or the specified time is zero.
  338. */
static inline struct ip_vs_sync_buff *
get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
		   unsigned long time)
{
	struct ip_vs_sync_buff *sb;

	spin_lock_bh(&ipvs->sync_buff_lock);
	sb = ms->sync_buff;
	/* Detach the buffer only when it is old enough (or time == 0);
	 * the caller then owns it.
	 */
	if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
		ms->sync_buff = NULL;
		/* we have work: cancel a pending sleep announced elsewhere */
		__set_current_state(TASK_RUNNING);
	} else
		sb = NULL;
	spin_unlock_bh(&ipvs->sync_buff_lock);
	return sb;
}
  354. static inline int
  355. select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
  356. {
  357. return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
  358. }
  359. /*
  360. * Create a new sync buffer for Version 0 proto.
  361. */
  362. static inline struct ip_vs_sync_buff *
  363. ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len)
  364. {
  365. struct ip_vs_sync_buff *sb;
  366. struct ip_vs_sync_mesg_v0 *mesg;
  367. if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
  368. return NULL;
  369. len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0),
  370. ipvs->mcfg.sync_maxlen);
  371. sb->mesg = kmalloc(len, GFP_ATOMIC);
  372. if (!sb->mesg) {
  373. kfree(sb);
  374. return NULL;
  375. }
  376. mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
  377. mesg->nr_conns = 0;
  378. mesg->syncid = ipvs->mcfg.syncid;
  379. mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
  380. sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
  381. sb->end = (unsigned char *)mesg + len;
  382. sb->firstuse = jiffies;
  383. return sb;
  384. }
  385. /* Check if connection is controlled by persistence */
  386. static inline bool in_persistence(struct ip_vs_conn *cp)
  387. {
  388. for (cp = cp->control; cp; cp = cp->control) {
  389. if (cp->flags & IP_VS_CONN_F_TEMPLATE)
  390. return true;
  391. }
  392. return false;
  393. }
  394. /* Check if conn should be synced.
  395. * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
  396. * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
  397. * sync_retries times with period of sync_refresh_period/8
  398. * - (2) if both sync_refresh_period and sync_period are 0 send sync only
  399. * for state changes or only once when pkts matches sync_threshold
  400. * - (3) templates: rate can be reduced only with sync_refresh_period or
  401. * with (2)
  402. */
static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
				  struct ip_vs_conn *cp, int pkts)
{
	unsigned long orig = ACCESS_ONCE(cp->sync_endtime);
	unsigned long now = jiffies;
	/* Candidate next sync deadline; the two low bits are reserved for
	 * the retry counter, hence the & ~3UL.
	 */
	unsigned long n = (now + cp->timeout) & ~3UL;
	unsigned int sync_refresh_period;
	int sync_period;
	int force;

	/* Check if we sync in current state */
	if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
		force = 0;
	else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
		/* persist mode: only templates are synced */
		return 0;
	else if (likely(cp->protocol == IPPROTO_TCP)) {
		/* sync only in stable/terminal TCP states */
		if (!((1 << cp->state) &
		      ((1 << IP_VS_TCP_S_ESTABLISHED) |
		       (1 << IP_VS_TCP_S_FIN_WAIT) |
		       (1 << IP_VS_TCP_S_CLOSE) |
		       (1 << IP_VS_TCP_S_CLOSE_WAIT) |
		       (1 << IP_VS_TCP_S_TIME_WAIT))))
			return 0;
		force = cp->state != cp->old_state;
		/* state change away from ESTABLISHED: sync unconditionally */
		if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
			goto set;
	} else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
		if (!((1 << cp->state) &
		      ((1 << IP_VS_SCTP_S_ESTABLISHED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
		       (1 << IP_VS_SCTP_S_CLOSED))))
			return 0;
		force = cp->state != cp->old_state;
		if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
			goto set;
	} else {
		/* UDP or another protocol with single state */
		force = 0;
	}

	sync_refresh_period = sysctl_sync_refresh_period(ipvs);
	if (sync_refresh_period > 0) {
		long diff = n - orig;
		long min_diff = max(cp->timeout >> 1, 10UL * HZ);

		/* Avoid sync if difference is below sync_refresh_period
		 * and below the half timeout.
		 */
		if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
			/* retry counter lives in the low 2 bits of
			 * sync_endtime
			 */
			int retries = orig & 3;

			if (retries >= sysctl_sync_retries(ipvs))
				return 0;
			if (time_before(now, orig - cp->timeout +
					(sync_refresh_period >> 3)))
				return 0;
			n |= retries + 1;
		}
	}
	sync_period = sysctl_sync_period(ipvs);
	if (sync_period > 0) {
		/* sync once every sync_period packets (non-templates) */
		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
		    pkts % sync_period != sysctl_sync_threshold(ipvs))
			return 0;
	} else if (sync_refresh_period <= 0 &&
		   pkts != sysctl_sync_threshold(ipvs))
		return 0;

set:
	cp->old_state = cp->state;
	/* Claim this sync slot; if another CPU updated sync_endtime first
	 * (cmpxchg fails), only a forced state change still triggers sync.
	 */
	n = cmpxchg(&cp->sync_endtime, orig, n);
	return n == orig || force;
}
/*
 *      Version 0 , could be switched in by sys_ctl.
 *      Add an ip_vs_conn information into the current sync_buff.
 *      IPv4 only; a controlling (template) connection is synced too, via
 *      the ip_vs_sync_conn() tail call.
 */
static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
			       int pkts)
{
	struct ip_vs_sync_mesg_v0 *m;
	struct ip_vs_sync_conn_v0 *s;
	struct ip_vs_sync_buff *buff;
	struct ipvs_master_sync_state *ms;
	int id;
	unsigned int len;

	/* Version 0 carries IPv4 addresses only */
	if (unlikely(cp->af != AF_INET))
		return;
	/* Do not sync ONE PACKET */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return;

	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
		return;

	spin_lock_bh(&ipvs->sync_buff_lock);
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		spin_unlock_bh(&ipvs->sync_buff_lock);
		return;
	}

	id = select_master_thread_id(ipvs, cp);
	ms = &ipvs->ms[id];
	buff = ms->sync_buff;
	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
		SIMPLE_CONN_SIZE;
	if (buff) {
		m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
		/* Send buffer if it is for v1: a v1 header starts with a
		 * zero byte, which reads here as nr_conns == 0.  Also flush
		 * when the entry would not fit.
		 */
		if (buff->head + len > buff->end || !m->nr_conns) {
			sb_queue_tail(ipvs, ms);
			ms->sync_buff = NULL;
			buff = NULL;
		}
	}
	if (!buff) {
		buff = ip_vs_sync_buff_create_v0(ipvs, len);
		if (!buff) {
			spin_unlock_bh(&ipvs->sync_buff_lock);
			pr_err("ip_vs_sync_buff_create failed.\n");
			return;
		}
		ms->sync_buff = buff;
	}

	m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
	s = (struct ip_vs_sync_conn_v0 *) buff->head;

	/* copy members */
	s->reserved = 0;
	s->protocol = cp->protocol;
	s->cport = cp->cport;
	s->vport = cp->vport;
	s->dport = cp->dport;
	s->caddr = cp->caddr.ip;
	s->vaddr = cp->vaddr.ip;
	s->daddr = cp->daddr.ip;
	/* HASHED is local state; never sent on the wire */
	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->state = htons(cp->state);
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		/* seq options live immediately after the fixed entry */
		struct ip_vs_sync_conn_options *opt =
			(struct ip_vs_sync_conn_options *)&s[1];
		memcpy(opt, &cp->in_seq, sizeof(*opt));
	}

	m->nr_conns++;
	m->size = htons(ntohs(m->size) + len);
	buff->head += len;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	/* synchronize its controller if it has */
	cp = cp->control;
	if (cp) {
		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
			pkts = atomic_add_return(1, &cp->in_pkts);
		else
			pkts = sysctl_sync_threshold(ipvs);
		ip_vs_sync_conn(ipvs, cp, pkts);
	}
}
/*
 *      Add an ip_vs_conn information into the current sync_buff.
 *      Called by ip_vs_in.
 *      Sending Version 1 messages.
 *      Loops over the chain of controlling connections (goto sloop) so
 *      templates get synced together with their controlled connections.
 */
void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts)
{
	struct ip_vs_sync_mesg *m;
	union ip_vs_sync_conn *s;
	struct ip_vs_sync_buff *buff;
	struct ipvs_master_sync_state *ms;
	int id;
	__u8 *p;
	unsigned int len, pe_name_len, pad;

	/* Handle old version of the protocol */
	if (sysctl_sync_ver(ipvs) == 0) {
		ip_vs_sync_conn_v0(ipvs, cp, pkts);
		return;
	}
	/* Do not sync ONE PACKET */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		goto control;
sloop:
	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
		goto control;

	/* Sanity checks */
	pe_name_len = 0;
	if (cp->pe_data_len) {
		if (!cp->pe_data || !cp->dest) {
			IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
			return;
		}
		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
	}

	spin_lock_bh(&ipvs->sync_buff_lock);
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		spin_unlock_bh(&ipvs->sync_buff_lock);
		return;
	}

	id = select_master_thread_id(ipvs, cp);
	ms = &ipvs->ms[id];

	/* Compute the on-wire size of this sync_conn + its options */
#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6)
		len = sizeof(struct ip_vs_sync_v6);
	else
#endif
		len = sizeof(struct ip_vs_sync_v4);

	if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
		len += sizeof(struct ip_vs_sync_conn_options) + 2;

	if (cp->pe_data_len)
		len += cp->pe_data_len + 2;	/* + Param hdr field */
	if (pe_name_len)
		len += pe_name_len + 2;

	/* check if there is a space for this one */
	pad = 0;
	buff = ms->sync_buff;
	if (buff) {
		m = buff->mesg;
		/* pad to a 32-bit boundary after the previous sync_conn */
		pad = (4 - (size_t) buff->head) & 3;
		/* Send buffer if it is for v0: a v0 buffer has a non-zero
		 * first byte (nr_conns), read here as m->reserved.  Also
		 * flush when the entry would not fit.
		 */
		if (buff->head + len + pad > buff->end || m->reserved) {
			sb_queue_tail(ipvs, ms);
			ms->sync_buff = NULL;
			buff = NULL;
			pad = 0;
		}
	}

	if (!buff) {
		buff = ip_vs_sync_buff_create(ipvs, len);
		if (!buff) {
			spin_unlock_bh(&ipvs->sync_buff_lock);
			pr_err("ip_vs_sync_buff_create failed.\n");
			return;
		}
		ms->sync_buff = buff;
		m = buff->mesg;
	}

	p = buff->head;
	buff->head += pad + len;
	m->size = htons(ntohs(m->size) + pad + len);
	/* Add ev. padding from prev. sync_conn */
	while (pad--)
		*(p++) = 0;

	s = (union ip_vs_sync_conn *)p;

	/* Set message type & copy members */
	s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
	/* inner-entry version nibble is 0: len & SVER_MASK clears it */
	s->v4.ver_size = htons(len & SVER_MASK);	/* Version 0 */
	/* HASHED is local state; never sent on the wire */
	s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->v4.state = htons(cp->state);
	s->v4.protocol = cp->protocol;
	s->v4.cport = cp->cport;
	s->v4.vport = cp->vport;
	s->v4.dport = cp->dport;
	s->v4.fwmark = htonl(cp->fwmark);
	s->v4.timeout = htonl(cp->timeout / HZ);
	m->nr_conns++;

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6) {
		p += sizeof(struct ip_vs_sync_v6);
		s->v6.caddr = cp->caddr.in6;
		s->v6.vaddr = cp->vaddr.in6;
		s->v6.daddr = cp->daddr.in6;
	} else
#endif
	{
		p += sizeof(struct ip_vs_sync_v4);	/* options ptr */
		s->v4.caddr = cp->caddr.ip;
		s->v4.vaddr = cp->vaddr.ip;
		s->v4.daddr = cp->daddr.ip;
	}
	/* Optional parameters: type byte, length byte, then data */
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		*(p++) = IPVS_OPT_SEQ_DATA;
		*(p++) = sizeof(struct ip_vs_sync_conn_options);
		hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
		p += sizeof(struct ip_vs_seq);
		hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
		p += sizeof(struct ip_vs_seq);
	}
	/* Handle pe data */
	if (cp->pe_data_len && cp->pe_data) {
		*(p++) = IPVS_OPT_PE_DATA;
		*(p++) = cp->pe_data_len;
		memcpy(p, cp->pe_data, cp->pe_data_len);
		p += cp->pe_data_len;
		if (pe_name_len) {
			/* Add PE_NAME */
			*(p++) = IPVS_OPT_PE_NAME;
			*(p++) = pe_name_len;
			memcpy(p, cp->pe->name, pe_name_len);
			p += pe_name_len;
		}
	}

	spin_unlock_bh(&ipvs->sync_buff_lock);

control:
	/* synchronize its controller if it has */
	cp = cp->control;
	if (!cp)
		return;
	if (cp->flags & IP_VS_CONN_F_TEMPLATE)
		pkts = atomic_add_return(1, &cp->in_pkts);
	else
		pkts = sysctl_sync_threshold(ipvs);
	goto sloop;
}
  697. /*
  698. * fill_param used by version 1
  699. */
/*
 * Fill a connection parameter block (struct ip_vs_conn_param) from a
 * received version 1 sync entry, resolving the optional persistence
 * engine (PE) by name and duplicating its private data.
 *
 * Returns 0 on success, 1 when the PE parameters are malformed or the
 * named engine is not loaded, -ENOMEM when duplicating pe_data fails.
 */
static inline int
ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc,
			   struct ip_vs_conn_param *p,
			   __u8 *pe_data, unsigned int pe_data_len,
			   __u8 *pe_name, unsigned int pe_name_len)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol,
				      (const union nf_inet_addr *)&sc->v6.caddr,
				      sc->v6.cport,
				      (const union nf_inet_addr *)&sc->v6.vaddr,
				      sc->v6.vport, p);
	else
#endif
		ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol,
				      (const union nf_inet_addr *)&sc->v4.caddr,
				      sc->v4.cport,
				      (const union nf_inet_addr *)&sc->v4.vaddr,
				      sc->v4.vport, p);
	/* Handle pe data */
	if (pe_data_len) {
		if (pe_name_len) {
			char buff[IP_VS_PENAME_MAXLEN+1];

			/* pe_name on the wire is not NUL-terminated */
			memcpy(buff, pe_name, pe_name_len);
			buff[pe_name_len]=0;
			p->pe = __ip_vs_pe_getbyname(buff);
			if (!p->pe) {
				IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
					  buff);
				return 1;
			}
		} else {
			/* pe_data without a pe_name is malformed */
			IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
			return 1;
		}

		p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
		if (!p->pe_data) {
			/* drop the module ref taken by __ip_vs_pe_getbyname() */
			module_put(p->pe->module);
			return -ENOMEM;
		}
		p->pe_data_len = pe_data_len;
	}
	return 0;
}
/*
 * Connection Add / Update.
 * Common for version 0 and 1 reception of backup sync_conns.
 * Creates the connection when it does not yet exist, otherwise updates
 * its flags, state and timeout. The timeout parameter is in seconds.
 */
static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param,
			    unsigned int flags, unsigned int state,
			    unsigned int protocol, unsigned int type,
			    const union nf_inet_addr *daddr, __be16 dport,
			    unsigned long timeout, __u32 fwmark,
			    struct ip_vs_sync_conn_options *opt)
{
	struct ip_vs_dest *dest;
	struct ip_vs_conn *cp;

	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
		cp = ip_vs_conn_in_get(param);
		/* Existing connection but with a different destination:
		 * the real server was changed on the master.
		 */
		if (cp && ((cp->dport != dport) ||
			   !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
			if (!(flags & IP_VS_CONN_F_INACTIVE)) {
				/* Active update: expire the stale conn now
				 * and create a fresh one below.
				 */
				ip_vs_conn_expire_now(cp);
				__ip_vs_conn_put(cp);
				cp = NULL;
			} else {
				/* This is the expiration message for the
				 * connection that was already replaced, so we
				 * just ignore it.
				 */
				__ip_vs_conn_put(cp);
				kfree(param->pe_data);
				return;
			}
		}
	} else {
		cp = ip_vs_ct_in_get(param);
	}

	if (cp) {
		/* Free pe_data */
		kfree(param->pe_data);

		dest = cp->dest;
		spin_lock_bh(&cp->lock);
		/* Mirror active/inactive counters on the dest when the
		 * INACTIVE bit flipped (non-template conns only).
		 */
		if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
		    !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
			if (flags & IP_VS_CONN_F_INACTIVE) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
			} else {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
			}
		}
		/* Only the BACKUP_UPD_MASK bits may be changed by the sync
		 * message; preserve all other locally-owned flag bits.
		 */
		flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
		flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
		cp->flags = flags;
		spin_unlock_bh(&cp->lock);
		if (!dest)
			ip_vs_try_bind_dest(cp);
	} else {
		/*
		 * Find the appropriate destination for the connection.
		 * If it is not found the connection will remain unbound
		 * but still handled.
		 */
		rcu_read_lock();
		/* This function is only invoked by the synchronization
		 * code. We do not currently support heterogeneous pools
		 * with synchronization, so we can make the assumption that
		 * the svc_af is the same as the dest_af
		 */
		dest = ip_vs_find_dest(ipvs, type, type, daddr, dport,
				       param->vaddr, param->vport, protocol,
				       fwmark, flags);

		cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest,
				    fwmark);
		rcu_read_unlock();
		if (!cp) {
			kfree(param->pe_data);
			IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
			return;
		}
		/* NOTE(review): for non-templates pe_data is freed here,
		 * which suggests ip_vs_conn_new() keeps it only for
		 * templates - confirm against ip_vs_conn_new().
		 */
		if (!(flags & IP_VS_CONN_F_TEMPLATE))
			kfree(param->pe_data);
	}

	if (opt)
		memcpy(&cp->in_seq, opt, sizeof(*opt));
	atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
	cp->state = state;
	cp->old_state = cp->state;
	/*
	 * For Ver 0 messages style
	 *  - Not possible to recover the right timeout for templates
	 *  - can not find the right fwmark
	 *    virtual service. If needed, we can do it for
	 *    non-fwmark persistent services.
	 * Ver 1 messages style.
	 *  - No problem.
	 */
	if (timeout) {
		/* Clamp to the largest schedulable timeout */
		if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
			timeout = MAX_SCHEDULE_TIMEOUT / HZ;
		cp->timeout = timeout*HZ;
	} else {
		struct ip_vs_proto_data *pd;

		pd = ip_vs_proto_data_get(ipvs, protocol);
		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
			cp->timeout = pd->timeout_table[state];
		else
			/* fallback when no per-protocol timeout is known */
			cp->timeout = (3*60*HZ);
	}
	ip_vs_conn_put(cp);
}
  856. /*
  857. * Process received multicast message for Version 0
  858. */
static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer,
				     const size_t buflen)
{
	struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
	struct ip_vs_sync_conn_v0 *s;
	struct ip_vs_sync_conn_options *opt;
	struct ip_vs_protocol *pp;
	struct ip_vs_conn_param param;
	char *p;
	int i;

	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
	for (i=0; i<m->nr_conns; i++) {
		unsigned int flags, state;

		/* At least a minimal (option-less) entry must fit */
		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
			IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
			return;
		}
		s = (struct ip_vs_sync_conn_v0 *) p;
		flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
		flags &= ~IP_VS_CONN_F_HASHED;
		if (flags & IP_VS_CONN_F_SEQ_MASK) {
			/* sequence options follow directly after the conn */
			opt = (struct ip_vs_sync_conn_options *)&s[1];
			p += FULL_CONN_SIZE;
			if (p > buffer+buflen) {
				IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
				return;
			}
		} else {
			opt = NULL;
			p += SIMPLE_CONN_SIZE;
		}

		state = ntohs(s->state);
		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
			pp = ip_vs_proto_get(s->protocol);
			if (!pp) {
				IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
					  s->protocol);
				continue;
			}
			if (state >= pp->num_states) {
				IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
					  pp->name, state);
				continue;
			}
		} else {
			/* protocol in templates is not used for state/timeout */
			if (state > 0) {
				IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
					  state);
				state = 0;
			}
		}

		ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
				      (const union nf_inet_addr *)&s->caddr,
				      s->cport,
				      (const union nf_inet_addr *)&s->vaddr,
				      s->vport, &param);

		/* Send timeout as Zero */
		ip_vs_proc_conn(ipvs, &param, flags, state, s->protocol, AF_INET,
				(union nf_inet_addr *)&s->daddr, s->dport,
				0, 0, opt);
	}
}
  922. /*
  923. * Handle options
  924. */
  925. static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
  926. __u32 *opt_flags,
  927. struct ip_vs_sync_conn_options *opt)
  928. {
  929. struct ip_vs_sync_conn_options *topt;
  930. topt = (struct ip_vs_sync_conn_options *)p;
  931. if (plen != sizeof(struct ip_vs_sync_conn_options)) {
  932. IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
  933. return -EINVAL;
  934. }
  935. if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
  936. IP_VS_DBG(2, "BACKUP, conn options found twice\n");
  937. return -EINVAL;
  938. }
  939. ntoh_seq(&topt->in_seq, &opt->in_seq);
  940. ntoh_seq(&topt->out_seq, &opt->out_seq);
  941. *opt_flags |= IPVS_OPT_F_SEQ_DATA;
  942. return 0;
  943. }
  944. static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
  945. __u8 **data, unsigned int maxlen,
  946. __u32 *opt_flags, __u32 flag)
  947. {
  948. if (plen > maxlen) {
  949. IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
  950. return -EINVAL;
  951. }
  952. if (*opt_flags & flag) {
  953. IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
  954. return -EINVAL;
  955. }
  956. *data_len = plen;
  957. *data = p;
  958. *opt_flags |= flag;
  959. return 0;
  960. }
  961. /*
  962. * Process a Version 1 sync. connection
  963. */
/*
 * Decode and apply one version 1 sync connection entry located between
 * p and msg_end.  Returns 0 on success; a negative value for malformed
 * entries (the caller drops the whole buffer); a positive value when
 * only this single entry is dropped.
 */
static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end)
{
	struct ip_vs_sync_conn_options opt;
	union ip_vs_sync_conn *s;
	struct ip_vs_protocol *pp;
	struct ip_vs_conn_param param;
	__u32 flags;
	unsigned int af, state, pe_data_len=0, pe_name_len=0;
	__u8 *pe_data=NULL, *pe_name=NULL;
	__u32 opt_flags=0;
	int retc=0;

	s = (union ip_vs_sync_conn *) p;

	/* Pick the address family from the type field and step over the
	 * fixed-size part of the entry.
	 */
	if (s->v6.type & STYPE_F_INET6) {
#ifdef CONFIG_IP_VS_IPV6
		af = AF_INET6;
		p += sizeof(struct ip_vs_sync_v6);
#else
		IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
		retc = 10;
		goto out;
#endif
	} else if (!s->v4.type) {
		af = AF_INET;
		p += sizeof(struct ip_vs_sync_v4);
	} else {
		/* unknown type bits: drop the whole buffer */
		return -10;
	}
	if (p > msg_end)
		return -20;

	/* Process optional params check Type & Len. */
	while (p < msg_end) {
		int ptype;
		int plen;

		/* need at least the two-byte type/len header */
		if (p+2 > msg_end)
			return -30;
		ptype = *(p++);
		plen = *(p++);

		if (!plen || ((p + plen) > msg_end))
			return -40;
		/* Handle seq option p = param data */
		switch (ptype & ~IPVS_OPT_F_PARAM) {
		case IPVS_OPT_SEQ_DATA:
			if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
				return -50;
			break;

		case IPVS_OPT_PE_DATA:
			if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
					   IP_VS_PEDATA_MAXLEN, &opt_flags,
					   IPVS_OPT_F_PE_DATA))
				return -60;
			break;

		case IPVS_OPT_PE_NAME:
			if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
					   IP_VS_PENAME_MAXLEN, &opt_flags,
					   IPVS_OPT_F_PE_NAME))
				return -70;
			break;

		default:
			/* Param data mandatory ? */
			if (!(ptype & IPVS_OPT_F_PARAM)) {
				IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
					  ptype & ~IPVS_OPT_F_PARAM);
				retc = 20;
				goto out;
			}
		}
		p += plen; /* Next option */
	}

	/* Get flags and Mask off unsupported */
	flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
	flags |= IP_VS_CONN_F_SYNC;
	state = ntohs(s->v4.state);

	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
		pp = ip_vs_proto_get(s->v4.protocol);
		if (!pp) {
			IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
				  s->v4.protocol);
			retc = 30;
			goto out;
		}
		if (state >= pp->num_states) {
			IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
				  pp->name, state);
			retc = 40;
			goto out;
		}
	} else {
		/* protocol in templates is not used for state/timeout */
		if (state > 0) {
			IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
				  state);
			state = 0;
		}
	}
	if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data,
				       pe_data_len, pe_name, pe_name_len)) {
		retc = 50;
		goto out;
	}
	/* If only IPv4, just silent skip IPv6 */
	if (af == AF_INET)
		ip_vs_proc_conn(ipvs, &param, flags, state, s->v4.protocol, af,
				(union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
				ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
				);
#ifdef CONFIG_IP_VS_IPV6
	else
		ip_vs_proc_conn(ipvs, &param, flags, state, s->v6.protocol, af,
				(union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
				ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
				);
#endif
	ip_vs_pe_put(param.pe);
	return 0;
	/* Error exit */
out:
	IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
	return retc;
}
  1085. /*
  1086. * Process received multicast message and create the corresponding
  1087. * ip_vs_conn entries.
  1088. * Handles Version 0 & 1
  1089. */
  1090. static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer,
  1091. const size_t buflen)
  1092. {
  1093. struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
  1094. __u8 *p, *msg_end;
  1095. int i, nr_conns;
  1096. if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
  1097. IP_VS_DBG(2, "BACKUP, message header too short\n");
  1098. return;
  1099. }
  1100. if (buflen != ntohs(m2->size)) {
  1101. IP_VS_DBG(2, "BACKUP, bogus message size\n");
  1102. return;
  1103. }
  1104. /* SyncID sanity check */
  1105. if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) {
  1106. IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
  1107. return;
  1108. }
  1109. /* Handle version 1 message */
  1110. if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
  1111. && (m2->spare == 0)) {
  1112. msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
  1113. nr_conns = m2->nr_conns;
  1114. for (i=0; i<nr_conns; i++) {
  1115. union ip_vs_sync_conn *s;
  1116. unsigned int size;
  1117. int retc;
  1118. p = msg_end;
  1119. if (p + sizeof(s->v4) > buffer+buflen) {
  1120. IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
  1121. return;
  1122. }
  1123. s = (union ip_vs_sync_conn *)p;
  1124. size = ntohs(s->v4.ver_size) & SVER_MASK;
  1125. msg_end = p + size;
  1126. /* Basic sanity checks */
  1127. if (msg_end > buffer+buflen) {
  1128. IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
  1129. return;
  1130. }
  1131. if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
  1132. IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
  1133. ntohs(s->v4.ver_size) >> SVER_SHIFT);
  1134. return;
  1135. }
  1136. /* Process a single sync_conn */
  1137. retc = ip_vs_proc_sync_conn(ipvs, p, msg_end);
  1138. if (retc < 0) {
  1139. IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
  1140. retc);
  1141. return;
  1142. }
  1143. /* Make sure we have 32 bit alignment */
  1144. msg_end = p + ((size + 3) & ~3);
  1145. }
  1146. } else {
  1147. /* Old type of message */
  1148. ip_vs_process_message_v0(ipvs, buffer, buflen);
  1149. return;
  1150. }
  1151. }
  1152. /*
  1153. * Setup sndbuf (mode=1) or rcvbuf (mode=0)
  1154. */
  1155. static void set_sock_size(struct sock *sk, int mode, int val)
  1156. {
  1157. /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
  1158. /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
  1159. lock_sock(sk);
  1160. if (mode) {
  1161. val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
  1162. sysctl_wmem_max);
  1163. sk->sk_sndbuf = val * 2;
  1164. sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
  1165. } else {
  1166. val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
  1167. sysctl_rmem_max);
  1168. sk->sk_rcvbuf = val * 2;
  1169. sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
  1170. }
  1171. release_sock(sk);
  1172. }
  1173. /*
  1174. * Setup loopback of outgoing multicasts on a sending socket
  1175. */
  1176. static void set_mcast_loop(struct sock *sk, u_char loop)
  1177. {
  1178. struct inet_sock *inet = inet_sk(sk);
  1179. /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
  1180. lock_sock(sk);
  1181. inet->mc_loop = loop ? 1 : 0;
  1182. #ifdef CONFIG_IP_VS_IPV6
  1183. if (sk->sk_family == AF_INET6) {
  1184. struct ipv6_pinfo *np = inet6_sk(sk);
  1185. /* IPV6_MULTICAST_LOOP */
  1186. np->mc_loop = loop ? 1 : 0;
  1187. }
  1188. #endif
  1189. release_sock(sk);
  1190. }
  1191. /*
  1192. * Specify TTL for outgoing multicasts on a sending socket
  1193. */
  1194. static void set_mcast_ttl(struct sock *sk, u_char ttl)
  1195. {
  1196. struct inet_sock *inet = inet_sk(sk);
  1197. /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
  1198. lock_sock(sk);
  1199. inet->mc_ttl = ttl;
  1200. #ifdef CONFIG_IP_VS_IPV6
  1201. if (sk->sk_family == AF_INET6) {
  1202. struct ipv6_pinfo *np = inet6_sk(sk);
  1203. /* IPV6_MULTICAST_HOPS */
  1204. np->mcast_hops = ttl;
  1205. }
  1206. #endif
  1207. release_sock(sk);
  1208. }
  1209. /* Control fragmentation of messages */
  1210. static void set_mcast_pmtudisc(struct sock *sk, int val)
  1211. {
  1212. struct inet_sock *inet = inet_sk(sk);
  1213. /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
  1214. lock_sock(sk);
  1215. inet->pmtudisc = val;
  1216. #ifdef CONFIG_IP_VS_IPV6
  1217. if (sk->sk_family == AF_INET6) {
  1218. struct ipv6_pinfo *np = inet6_sk(sk);
  1219. /* IPV6_MTU_DISCOVER */
  1220. np->pmtudisc = val;
  1221. }
  1222. #endif
  1223. release_sock(sk);
  1224. }
/*
 * Specify the default interface for outgoing multicasts
 */
  1228. static int set_mcast_if(struct sock *sk, struct net_device *dev)
  1229. {
  1230. struct inet_sock *inet = inet_sk(sk);
  1231. if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
  1232. return -EINVAL;
  1233. lock_sock(sk);
  1234. inet->mc_index = dev->ifindex;
  1235. /* inet->mc_addr = 0; */
  1236. #ifdef CONFIG_IP_VS_IPV6
  1237. if (sk->sk_family == AF_INET6) {
  1238. struct ipv6_pinfo *np = inet6_sk(sk);
  1239. /* IPV6_MULTICAST_IF */
  1240. np->mcast_oif = dev->ifindex;
  1241. }
  1242. #endif
  1243. release_sock(sk);
  1244. return 0;
  1245. }
  1246. /*
  1247. * Join a multicast group.
  1248. * the group is specified by a class D multicast address 224.0.0.0/8
  1249. * in the in_addr structure passed in as a parameter.
  1250. */
  1251. static int
  1252. join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev)
  1253. {
  1254. struct ip_mreqn mreq;
  1255. int ret;
  1256. memset(&mreq, 0, sizeof(mreq));
  1257. memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
  1258. if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
  1259. return -EINVAL;
  1260. mreq.imr_ifindex = dev->ifindex;
  1261. lock_sock(sk);
  1262. ret = ip_mc_join_group(sk, &mreq);
  1263. release_sock(sk);
  1264. return ret;
  1265. }
  1266. #ifdef CONFIG_IP_VS_IPV6
  1267. static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
  1268. struct net_device *dev)
  1269. {
  1270. int ret;
  1271. if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
  1272. return -EINVAL;
  1273. lock_sock(sk);
  1274. ret = ipv6_sock_mc_join(sk, dev->ifindex, addr);
  1275. release_sock(sk);
  1276. return ret;
  1277. }
  1278. #endif
  1279. static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
  1280. {
  1281. __be32 addr;
  1282. struct sockaddr_in sin;
  1283. addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
  1284. if (!addr)
  1285. pr_err("You probably need to specify IP address on "
  1286. "multicast interface.\n");
  1287. IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
  1288. dev->name, &addr);
  1289. /* Now bind the socket with the address of multicast interface */
  1290. sin.sin_family = AF_INET;
  1291. sin.sin_addr.s_addr = addr;
  1292. sin.sin_port = 0;
  1293. return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
  1294. }
  1295. static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
  1296. struct ipvs_sync_daemon_cfg *c, int id)
  1297. {
  1298. if (AF_INET6 == c->mcast_af) {
  1299. sa->in6 = (struct sockaddr_in6) {
  1300. .sin6_family = AF_INET6,
  1301. .sin6_port = htons(c->mcast_port + id),
  1302. };
  1303. sa->in6.sin6_addr = c->mcast_group.in6;
  1304. *salen = sizeof(sa->in6);
  1305. } else {
  1306. sa->in = (struct sockaddr_in) {
  1307. .sin_family = AF_INET,
  1308. .sin_port = htons(c->mcast_port + id),
  1309. };
  1310. sa->in.sin_addr = c->mcast_group.in;
  1311. *salen = sizeof(sa->in);
  1312. }
  1313. }
  1314. /*
  1315. * Set up sending multicast socket over UDP
  1316. */
static int make_send_sock(struct netns_ipvs *ipvs, int id,
			  struct net_device *dev, struct socket **sock_ret)
{
	/* multicast addr */
	union ipvs_sockaddr mcast_addr;
	struct socket *sock;
	int result, salen;

	/* First create a socket */
	result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM,
				  IPPROTO_UDP, &sock);
	if (result < 0) {
		pr_err("Error during creation of socket; terminating\n");
		goto error;
	}
	/* *sock_ret is set before configuration; error paths below do not
	 * release the socket themselves, that is left to the caller.
	 */
	*sock_ret = sock;
	result = set_mcast_if(sock->sk, dev);
	if (result < 0) {
		pr_err("Error setting outbound mcast interface\n");
		goto error;
	}

	/* do not loop our own sync traffic back to us */
	set_mcast_loop(sock->sk, 0);
	set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl);
	/* Allow fragmentation if MTU changes */
	set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
	result = sysctl_sync_sock_size(ipvs);
	if (result > 0)
		set_sock_size(sock->sk, 1, result);

	/* only IPv4 gets an explicit source-address bind here */
	if (AF_INET == ipvs->mcfg.mcast_af)
		result = bind_mcastif_addr(sock, dev);
	else
		result = 0;
	if (result < 0) {
		pr_err("Error binding address of the mcast interface\n");
		goto error;
	}

	get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
	result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
				    salen, 0);
	if (result < 0) {
		pr_err("Error connecting to the multicast addr\n");
		goto error;
	}

	return 0;

error:
	return result;
}
  1363. /*
  1364. * Set up receiving multicast socket over UDP
  1365. */
static int make_receive_sock(struct netns_ipvs *ipvs, int id,
			     struct net_device *dev, struct socket **sock_ret)
{
	/* multicast addr */
	union ipvs_sockaddr mcast_addr;
	struct socket *sock;
	int result, salen;

	/* First create a socket */
	result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM,
				  IPPROTO_UDP, &sock);
	if (result < 0) {
		pr_err("Error during creation of socket; terminating\n");
		goto error;
	}
	/* *sock_ret is set before configuration; error paths below leave
	 * releasing the socket to the caller.
	 */
	*sock_ret = sock;
	/* it is equivalent to the REUSEADDR option in user-space */
	sock->sk->sk_reuse = SK_CAN_REUSE;
	result = sysctl_sync_sock_size(ipvs);
	if (result > 0)
		set_sock_size(sock->sk, 0, result);

	get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
	/* bind the socket to the receive interface before the address */
	sock->sk->sk_bound_dev_if = dev->ifindex;
	result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
	if (result < 0) {
		pr_err("Error binding to the multicast addr\n");
		goto error;
	}

	/* join the multicast group */
#ifdef CONFIG_IP_VS_IPV6
	if (ipvs->bcfg.mcast_af == AF_INET6)
		result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
					   dev);
	else
#endif
		result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
					  dev);
	if (result < 0) {
		pr_err("Error joining to the multicast group\n");
		goto error;
	}

	return 0;

error:
	return result;
}
  1410. static int
  1411. ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
  1412. {
  1413. struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
  1414. struct kvec iov;
  1415. int len;
  1416. EnterFunction(7);
  1417. iov.iov_base = (void *)buffer;
  1418. iov.iov_len = length;
  1419. len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
  1420. LeaveFunction(7);
  1421. return len;
  1422. }
  1423. static int
  1424. ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
  1425. {
  1426. int msize;
  1427. int ret;
  1428. msize = ntohs(msg->size);
  1429. ret = ip_vs_send_async(sock, (char *)msg, msize);
  1430. if (ret >= 0 || ret == -EAGAIN)
  1431. return ret;
  1432. pr_err("ip_vs_send_async error %d\n", ret);
  1433. return 0;
  1434. }
  1435. static int
  1436. ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
  1437. {
  1438. struct msghdr msg = {NULL,};
  1439. struct kvec iov;
  1440. int len;
  1441. EnterFunction(7);
  1442. /* Receive a packet */
  1443. iov.iov_base = buffer;
  1444. iov.iov_len = (size_t)buflen;
  1445. len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT);
  1446. if (len < 0)
  1447. return len;
  1448. LeaveFunction(7);
  1449. return len;
  1450. }
  1451. /* Wakeup the master thread for sending */
  1452. static void master_wakeup_work_handler(struct work_struct *work)
  1453. {
  1454. struct ipvs_master_sync_state *ms =
  1455. container_of(work, struct ipvs_master_sync_state,
  1456. master_wakeup_work.work);
  1457. struct netns_ipvs *ipvs = ms->ipvs;
  1458. spin_lock_bh(&ipvs->sync_lock);
  1459. if (ms->sync_queue_len &&
  1460. ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
  1461. ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
  1462. wake_up_process(ms->master_thread);
  1463. }
  1464. spin_unlock_bh(&ipvs->sync_lock);
  1465. }
  1466. /* Get next buffer to send */
  1467. static inline struct ip_vs_sync_buff *
  1468. next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
  1469. {
  1470. struct ip_vs_sync_buff *sb;
  1471. sb = sb_dequeue(ipvs, ms);
  1472. if (sb)
  1473. return sb;
  1474. /* Do not delay entries in buffer for more than 2 seconds */
  1475. return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
  1476. }
static int sync_thread_master(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
	struct sock *sk = tinfo->sock->sk;
	struct ip_vs_sync_buff *sb;

	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id);

	for (;;) {
		sb = next_sync_buff(ipvs, ms);
		if (unlikely(kthread_should_stop()))
			break;
		if (!sb) {
			/* nothing ready to send - poll again shortly */
			schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
			continue;
		}
		/* retry until sent (send_sync_msg returns 0 on hard error) */
		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
			/* (Ab)use interruptible sleep to avoid increasing
			 * the load avg.
			 */
			__wait_event_interruptible(*sk_sleep(sk),
						   sock_writeable(sk) ||
						   kthread_should_stop());
			if (unlikely(kthread_should_stop()))
				goto done;
		}
		ip_vs_sync_buff_release(sb);
	}

done:
	__set_current_state(TASK_RUNNING);
	/* sb may still hold an unsent buffer when we are stopped */
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* clean up the sync_buff queue */
	while ((sb = sb_dequeue(ipvs, ms)))
		ip_vs_sync_buff_release(sb);
	__set_current_state(TASK_RUNNING);

	/* clean up the current sync_buff */
	sb = get_curr_sync_buff(ipvs, ms, 0);
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* release the sending multicast socket */
	sock_release(tinfo->sock);
	kfree(tinfo);
	return 0;
}
static int sync_thread_backup(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	int len;

	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);

	while (!kthread_should_stop()) {
		/* sleep until a packet arrives or we are told to stop */
		wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
			 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
			 || kthread_should_stop());

		/* do we have data now? */
		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
			len = ip_vs_receive(tinfo->sock, tinfo->buf,
					    ipvs->bcfg.sync_maxlen);
			if (len <= 0) {
				if (len != -EAGAIN)
					pr_err("receiving message error\n");
				break;
			}

			ip_vs_process_message(ipvs, tinfo->buf, len);
		}
	}

	/* release the receiving multicast socket */
	sock_release(tinfo->sock);
	kfree(tinfo->buf);
	kfree(tinfo);
	return 0;
}
  1554. int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
  1555. int state)
  1556. {
  1557. struct ip_vs_sync_thread_data *tinfo = NULL;
  1558. struct task_struct **array = NULL, *task;
  1559. struct net_device *dev;
  1560. char *name;
  1561. int (*threadfn)(void *data);
  1562. int id = 0, count, hlen;
  1563. int result = -ENOMEM;
  1564. u16 mtu, min_mtu;
  1565. IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
  1566. IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
  1567. sizeof(struct ip_vs_sync_conn_v0));
  1568. /* Do not hold one mutex and then to block on another */
  1569. for (;;) {
  1570. rtnl_lock();
  1571. if (mutex_trylock(&ipvs->sync_mutex))
  1572. break;
  1573. rtnl_unlock();
  1574. mutex_lock(&ipvs->sync_mutex);
  1575. if (rtnl_trylock())
  1576. break;
  1577. mutex_unlock(&ipvs->sync_mutex);
  1578. }
  1579. if (!ipvs->sync_state) {
  1580. count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
  1581. ipvs->threads_mask = count - 1;
  1582. } else
  1583. count = ipvs->threads_mask + 1;
  1584. if (c->mcast_af == AF_UNSPEC) {
  1585. c->mcast_af = AF_INET;
  1586. c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP);
  1587. }
  1588. if (!c->mcast_port)
  1589. c->mcast_port = IP_VS_SYNC_PORT;
  1590. if (!c->mcast_ttl)
  1591. c->mcast_ttl = 1;
  1592. dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
  1593. if (!dev) {
  1594. pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
  1595. result = -ENODEV;
  1596. goto out_early;
  1597. }
  1598. hlen = (AF_INET6 == c->mcast_af) ?
  1599. sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
  1600. sizeof(struct iphdr) + sizeof(struct udphdr);
  1601. mtu = (state == IP_VS_STATE_BACKUP) ?
  1602. clamp(dev->mtu, 1500U, 65535U) : 1500U;
  1603. min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;
  1604. if (c->sync_maxlen)
  1605. c->sync_maxlen = clamp_t(unsigned int,
  1606. c->sync_maxlen, min_mtu,
  1607. 65535 - hlen);
  1608. else
  1609. c->sync_maxlen = mtu - hlen;
  1610. if (state == IP_VS_STATE_MASTER) {
  1611. result = -EEXIST;
  1612. if (ipvs->ms)
  1613. goto out_early;
  1614. ipvs->mcfg = *c;
  1615. name = "ipvs-m:%d:%d";
  1616. threadfn = sync_thread_master;
  1617. } else if (state == IP_VS_STATE_BACKUP) {
  1618. result = -EEXIST;
  1619. if (ipvs->backup_threads)
  1620. goto out_early;
  1621. ipvs->bcfg = *c;
  1622. name = "ipvs-b:%d:%d";
  1623. threadfn = sync_thread_backup;
  1624. } else {
  1625. result = -EINVAL;
  1626. goto out_early;
  1627. }
  1628. if (state == IP_VS_STATE_MASTER) {
  1629. struct ipvs_master_sync_state *ms;
  1630. result = -ENOMEM;
  1631. ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL);
  1632. if (!ipvs->ms)
  1633. goto out;
  1634. ms = ipvs->ms;
  1635. for (id = 0; id < count; id++, ms++) {
  1636. INIT_LIST_HEAD(&ms->sync_queue);
  1637. ms->sync_queue_len = 0;
  1638. ms->sync_queue_delay = 0;
  1639. INIT_DELAYED_WORK(&ms->master_wakeup_work,
  1640. master_wakeup_work_handler);
  1641. ms->ipvs = ipvs;
  1642. }
  1643. } else {
  1644. array = kzalloc(count * sizeof(struct task_struct *),
  1645. GFP_KERNEL);
  1646. result = -ENOMEM;
  1647. if (!array)
  1648. goto out;
  1649. }
  1650. for (id = 0; id < count; id++) {
  1651. result = -ENOMEM;
  1652. tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
  1653. if (!tinfo)
  1654. goto out;
  1655. tinfo->ipvs = ipvs;
  1656. tinfo->sock = NULL;
  1657. if (state == IP_VS_STATE_BACKUP) {
  1658. tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
  1659. GFP_KERNEL);
  1660. if (!tinfo->buf)
  1661. goto out;
  1662. } else {
  1663. tinfo->buf = NULL;
  1664. }
  1665. tinfo->id = id;
  1666. if (state == IP_VS_STATE_MASTER)
  1667. result = make_send_sock(ipvs, id, dev, &tinfo->sock);
  1668. else
  1669. result = make_receive_sock(ipvs, id, dev, &tinfo->sock);
  1670. if (result < 0)
  1671. goto out;
  1672. task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
  1673. if (IS_ERR(task)) {
  1674. result = PTR_ERR(task);
  1675. goto out;
  1676. }
  1677. tinfo = NULL;
  1678. if (state == IP_VS_STATE_MASTER)
  1679. ipvs->ms[id].master_thread = task;
  1680. else
  1681. array[id] = task;
  1682. }
  1683. /* mark as active */
  1684. if (state == IP_VS_STATE_BACKUP)
  1685. ipvs->backup_threads = array;
  1686. spin_lock_bh(&ipvs->sync_buff_lock);
  1687. ipvs->sync_state |= state;
  1688. spin_unlock_bh(&ipvs->sync_buff_lock);
  1689. mutex_unlock(&ipvs->sync_mutex);
  1690. rtnl_unlock();
  1691. /* increase the module use count */
  1692. ip_vs_use_count_inc();
  1693. return 0;
  1694. out:
  1695. /* We do not need RTNL lock anymore, release it here so that
  1696. * sock_release below and in the kthreads can use rtnl_lock
  1697. * to leave the mcast group.
  1698. */
  1699. rtnl_unlock();
  1700. count = id;
  1701. while (count-- > 0) {
  1702. if (state == IP_VS_STATE_MASTER)
  1703. kthread_stop(ipvs->ms[count].master_thread);
  1704. else
  1705. kthread_stop(array[count]);
  1706. }
  1707. if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
  1708. kfree(ipvs->ms);
  1709. ipvs->ms = NULL;
  1710. }
  1711. mutex_unlock(&ipvs->sync_mutex);
  1712. if (tinfo) {
  1713. if (tinfo->sock)
  1714. sock_release(tinfo->sock);
  1715. kfree(tinfo->buf);
  1716. kfree(tinfo);
  1717. }
  1718. kfree(array);
  1719. return result;
  1720. out_early:
  1721. mutex_unlock(&ipvs->sync_mutex);
  1722. rtnl_unlock();
  1723. return result;
  1724. }
/* Stop the master or backup sync daemon threads for this netns.
 *
 * @ipvs:  per-netns IPVS state owning the sync threads.
 * @state: IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP — which daemon to stop.
 *
 * Returns 0 on success, -ESRCH if no daemon of that kind is running,
 * -EINVAL for an unrecognized @state, or the first error reported by
 * kthread_stop().
 *
 * NOTE(review): callers appear to hold ipvs->sync_mutex around this call
 * (see ip_vs_sync_net_cleanup below) — confirm against the other call
 * sites before relying on it.
 */
int stop_sync_thread(struct netns_ipvs *ipvs, int state)
{
	struct task_struct **array;
	int id;
	int retc = -EINVAL;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));

	if (state == IP_VS_STATE_MASTER) {
		if (!ipvs->ms)
			return -ESRCH;

		/*
		 * The lock synchronizes with sb_queue_tail(), so that we don't
		 * add sync buffers to the queue, when we are already in
		 * progress of stopping the master sync daemon.
		 */
		/* Clear the MASTER bit under both locks (sync_lock nested
		 * inside sync_buff_lock) so producers see the state change
		 * atomically with respect to buffer queueing.
		 */
		spin_lock_bh(&ipvs->sync_buff_lock);
		spin_lock(&ipvs->sync_lock);
		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
		spin_unlock(&ipvs->sync_lock);
		spin_unlock_bh(&ipvs->sync_buff_lock);

		retc = 0;
		/* threads_mask is (thread count - 1), so this walks every
		 * per-CPU-ish master thread from highest id down to 0.
		 */
		for (id = ipvs->threads_mask; id >= 0; id--) {
			struct ipvs_master_sync_state *ms = &ipvs->ms[id];
			int ret;

			pr_info("stopping master sync thread %d ...\n",
				task_pid_nr(ms->master_thread));
			/* Flush the delayed wakeup work before stopping the
			 * thread it would wake.
			 */
			cancel_delayed_work_sync(&ms->master_wakeup_work);
			ret = kthread_stop(ms->master_thread);
			/* Latch the first kthread_stop() error; later
			 * successes must not overwrite it.
			 */
			if (retc >= 0)
				retc = ret;
		}
		kfree(ipvs->ms);
		ipvs->ms = NULL;
	} else if (state == IP_VS_STATE_BACKUP) {
		if (!ipvs->backup_threads)
			return -ESRCH;

		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
		array = ipvs->backup_threads;
		retc = 0;
		for (id = ipvs->threads_mask; id >= 0; id--) {
			int ret;

			pr_info("stopping backup sync thread %d ...\n",
				task_pid_nr(array[id]));
			ret = kthread_stop(array[id]);
			/* Same first-error latching as the master path. */
			if (retc >= 0)
				retc = ret;
		}
		kfree(array);
		ipvs->backup_threads = NULL;
	}

	/* decrease the module use count */
	ip_vs_use_count_dec();

	return retc;
}
  1778. /*
  1779. * Initialize data struct for each netns
  1780. */
  1781. int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs)
  1782. {
  1783. __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
  1784. spin_lock_init(&ipvs->sync_lock);
  1785. spin_lock_init(&ipvs->sync_buff_lock);
  1786. return 0;
  1787. }
  1788. void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
  1789. {
  1790. int retc;
  1791. mutex_lock(&ipvs->sync_mutex);
  1792. retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
  1793. if (retc && retc != -ESRCH)
  1794. pr_err("Failed to stop Master Daemon\n");
  1795. retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
  1796. if (retc && retc != -ESRCH)
  1797. pr_err("Failed to stop Backup Daemon\n");
  1798. mutex_unlock(&ipvs->sync_mutex);
  1799. }