ip_vs_xmit.c 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433
  1. /*
  2. * ip_vs_xmit.c: various packet transmitters for IPVS
  3. *
  4. * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
  5. * Julian Anastasov <ja@ssi.bg>
  6. *
  7. * This program is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU General Public License
  9. * as published by the Free Software Foundation; either version
  10. * 2 of the License, or (at your option) any later version.
  11. *
  12. * Changes:
  13. *
  14. * Description of forwarding methods:
  15. * - all transmitters are called from LOCAL_IN (remote clients) and
  16. * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
  17. * - not all connections have destination server, for example,
  18. * connections in backup server when fwmark is used
  19. * - bypass connections use daddr from packet
  20. * - we can use dst without ref while sending in RCU section, we use
  21. * ref when returning NF_ACCEPT for NAT-ed packet via loopback
  22. * LOCAL_OUT rules:
  23. * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
  24. * - skb->pkt_type is not set yet
  25. * - the only place where we can see skb->sk != NULL
  26. */
  27. #define KMSG_COMPONENT "IPVS"
  28. #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  29. #include <linux/kernel.h>
  30. #include <linux/slab.h>
  31. #include <linux/tcp.h> /* for tcphdr */
  32. #include <net/ip.h>
  33. #include <net/tcp.h> /* for csum_tcpudp_magic */
  34. #include <net/udp.h>
  35. #include <net/icmp.h> /* for icmp_send */
  36. #include <net/route.h> /* for ip_route_output */
  37. #include <net/ipv6.h>
  38. #include <net/ip6_route.h>
  39. #include <net/ip_tunnels.h>
  40. #include <net/addrconf.h>
  41. #include <linux/icmpv6.h>
  42. #include <linux/netfilter.h>
  43. #include <linux/netfilter_ipv4.h>
  44. #include <net/ip_vs.h>
/* Route lookup modes, OR-able flags consumed by __ip_vs_get_out_rt*() */
enum {
	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
				      * local
				      */
	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
};
  55. static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
  56. {
  57. return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
  58. }
/* Release a destination-route cache entry allocated by
 * ip_vs_dest_dst_alloc(); the cached dst itself is not touched here.
 */
static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
{
	kfree(dest_dst);
}
/*
 * Destination cache to speed up outgoing route lookup
 */
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
		struct dst_entry *dst, u32 dst_cookie)
{
	struct ip_vs_dest_dst *old;

	/* Caller must hold dest->dst_lock (see lockdep assertion below).
	 * Publish the new cache entry (or NULL to invalidate) and defer
	 * freeing of the old one until after an RCU grace period, since
	 * readers access it via rcu_dereference().
	 */
	old = rcu_dereference_protected(dest->dest_dst,
					lockdep_is_held(&dest->dst_lock));

	if (dest_dst) {
		dest_dst->dst_cache = dst;
		dest_dst->dst_cookie = dst_cookie;
	}
	rcu_assign_pointer(dest->dest_dst, dest_dst);

	if (old)
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}
/* Return dest's cached route entry if it is still valid, else NULL.
 * Must be called inside an RCU read-side section.
 */
static inline struct ip_vs_dest_dst *
__ip_vs_dst_check(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
	struct dst_entry *dst;

	if (!dest_dst)
		return NULL;
	dst = dest_dst->dst_cache;
	/* An obsolete dst may still be usable: delegate to its ->check()
	 * with the cookie stored when the route was cached.
	 */
	if (dst->obsolete &&
	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
		return NULL;
	return dest_dst;
}
  94. static inline bool
  95. __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
  96. {
  97. if (IP6CB(skb)->frag_max_size) {
  98. /* frag_max_size tell us that, this packet have been
  99. * defragmented by netfilter IPv6 conntrack module.
  100. */
  101. if (IP6CB(skb)->frag_max_size > mtu)
  102. return true; /* largest fragment violate MTU */
  103. }
  104. else if (skb->len > mtu && !skb_is_gso(skb)) {
  105. return true; /* Packet size violate MTU size */
  106. }
  107. return false;
  108. }
  109. /* Get route to daddr, update *saddr, optionally bind route to saddr */
  110. static struct rtable *do_output_route4(struct net *net, __be32 daddr,
  111. int rt_mode, __be32 *saddr)
  112. {
  113. struct flowi4 fl4;
  114. struct rtable *rt;
  115. int loop = 0;
  116. memset(&fl4, 0, sizeof(fl4));
  117. fl4.daddr = daddr;
  118. fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
  119. FLOWI_FLAG_KNOWN_NH : 0;
  120. retry:
  121. rt = ip_route_output_key(net, &fl4);
  122. if (IS_ERR(rt)) {
  123. /* Invalid saddr ? */
  124. if (PTR_ERR(rt) == -EINVAL && *saddr &&
  125. rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
  126. *saddr = 0;
  127. flowi4_update_output(&fl4, 0, 0, daddr, 0);
  128. goto retry;
  129. }
  130. IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
  131. return NULL;
  132. } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
  133. ip_rt_put(rt);
  134. *saddr = fl4.saddr;
  135. flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
  136. loop++;
  137. goto retry;
  138. }
  139. *saddr = fl4.saddr;
  140. return rt;
  141. }
  142. #ifdef CONFIG_IP_VS_IPV6
  143. static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
  144. {
  145. return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
  146. }
  147. #endif
  148. static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
  149. int rt_mode,
  150. bool new_rt_is_local)
  151. {
  152. bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
  153. bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
  154. bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
  155. bool source_is_loopback;
  156. bool old_rt_is_local;
  157. #ifdef CONFIG_IP_VS_IPV6
  158. if (skb_af == AF_INET6) {
  159. int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
  160. source_is_loopback =
  161. (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
  162. (addr_type & IPV6_ADDR_LOOPBACK);
  163. old_rt_is_local = __ip_vs_is_local_route6(
  164. (struct rt6_info *)skb_dst(skb));
  165. } else
  166. #endif
  167. {
  168. source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
  169. old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
  170. }
  171. if (unlikely(new_rt_is_local)) {
  172. if (!rt_mode_allow_local)
  173. return true;
  174. if (!rt_mode_allow_redirect && !old_rt_is_local)
  175. return true;
  176. } else {
  177. if (!rt_mode_allow_non_local)
  178. return true;
  179. if (source_is_loopback)
  180. return true;
  181. }
  182. return false;
  183. }
/* Propagate a reduced path MTU to the originating local socket, if any.
 * Only for locally generated packets (skb->dev is NULL in LOCAL_OUT)
 * with a full socket attached.
 */
static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
{
	struct sock *sk = skb->sk;
	struct rtable *ort = skb_rtable(skb);

	if (!skb->dev && sk && sk_fullsock(sk))
		ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
}
/* Return true if skb fits @mtu (or may be fragmented anyway); otherwise
 * send the appropriate ICMP error (PKT_TOOBIG / FRAG_NEEDED) back to
 * the sender and return false.
 */
static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
					  int rt_mode,
					  struct ip_vs_iphdr *ipvsh,
					  struct sk_buff *skb, int mtu)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct net *net = ipvs->net;

		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
			/* icmpv6_send needs a device to pick the netns */
			if (!skb->dev)
				skb->dev = net->loopback_dev;
			/* only send ICMP too big on first fragment */
			if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			IP_VS_DBG(1, "frag needed for %pI6c\n",
				  &ipv6_hdr(skb)->saddr);
			return false;
		}
	} else
#endif
	{
		/* If we're going to tunnel the packet and pmtu discovery
		 * is disabled, we'll just fragment it anyway
		 */
		if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
			return true;

		/* DF set and packet too big: signal FRAG_NEEDED unless the
		 * packet is GSO (segmented later) or an embedded ICMP one.
		 */
		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
			     skb->len > mtu && !skb_is_gso(skb) &&
			     !ip_vs_iph_icmp(ipvsh))) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			IP_VS_DBG(1, "frag needed for %pI4\n",
				  &ip_hdr(skb)->saddr);
			return false;
		}
	}

	return true;
}
/* Get route to destination or remote server.
 * Returns 1 if the destination is local, 0 if remote (route attached
 * to skb), -1 on error.  Runs inside an RCU read-side section; when a
 * dest is given, its cached route is used without taking a reference
 * (noref), relying on RCU for lifetime.
 */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		   struct ip_vs_dest *dest,
		   __be32 daddr, int rt_mode, __be32 *ret_saddr,
		   struct ip_vs_iphdr *ipvsh)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rtable *rt;			/* Route to the other host */
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = (struct rtable *) dest_dst->dst_cache;
		else {
			/* Cache miss: look up a fresh route under dst_lock
			 * and publish it for later packets.
			 */
			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			rt = do_output_route4(net, dest->addr.ip, rt_mode,
					      &dest_dst->dst_saddr.ip);
			if (!rt) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
				  atomic_read(&rt->dst.__refcnt));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.ip;
	} else {
		__be32 saddr = htonl(INADDR_ANY);

		/* No dest: uncached lookup, we hold a real dst reference */
		noref = 0;

		/* For such unconfigured boxes avoid many route lookups
		 * for performance reasons because we do not remember saddr
		 */
		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
		rt = do_output_route4(net, daddr, rt_mode, &saddr);
		if (!rt)
			goto err_unreach;
		if (ret_saddr)
			*ret_saddr = saddr;
	}

	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI4\n", &daddr);
		goto err_put;
	}
	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			ip_rt_put(rt);
		return local;
	}

	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
		mtu = dst_mtu(&rt->dst);
	} else {
		/* Tunnel mode: reserve room for the outer IPv4 header;
		 * 68 is the minimum IPv4 MTU (RFC 791).
		 */
		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
		if (mtu < 68) {
			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	/* Attach the new route: noref inside RCU, else transfer our ref */
	skb_dst_drop(skb);
	if (noref) {
		if (!local)
			skb_dst_set_noref(skb, &rt->dst);
		else
			skb_dst_set(skb, dst_clone(&rt->dst));
	} else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		ip_rt_put(rt);
	return -1;

err_unreach:
	dst_link_failure(skb);
	return -1;
}
  324. #ifdef CONFIG_IP_VS_IPV6
/* Look up an IPv6 route to @daddr.  On success returns the dst entry
 * and, when @ret_saddr is given, fills it with the selected source
 * address; passes the dst through xfrm when @do_xfrm.  NULL on error.
 */
static struct dst_entry *
__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
			struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
{
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.daddr = *daddr,
	};

	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;

	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error)
		goto out_err;
	if (!ret_saddr)
		return dst;
	/* Pick a source address if the routing lookup left it unset */
	if (ipv6_addr_any(&fl6.saddr) &&
	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
			       &fl6.daddr, 0, &fl6.saddr) < 0)
		goto out_err;
	if (do_xfrm) {
		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
		if (IS_ERR(dst)) {
			/* dst was consumed by xfrm_lookup on error */
			dst = NULL;
			goto out_err;
		}
	}
	*ret_saddr = fl6.saddr;
	return dst;

out_err:
	dst_release(dst);
	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
	return NULL;
}
/*
 * Get route to destination or remote server (IPv6 flavor).
 * Returns 1 if the destination is local, 0 if remote (route attached
 * to skb), -1 on error.  Mirrors __ip_vs_get_out_rt(): RCU section,
 * per-dest route cache with noref dst usage.
 */
static int
__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		      struct ip_vs_dest *dest,
		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rt6_info *rt;			/* Route to the other host */
	struct dst_entry *dst;
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = (struct rt6_info *) dest_dst->dst_cache;
		else {
			u32 cookie;

			/* Cache miss: take dst_lock, look up and publish */
			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
						      &dest_dst->dst_saddr.in6,
						      do_xfrm, rt_mode);
			if (!dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			rt = (struct rt6_info *) dst;
			/* Cookie lets __ip_vs_dst_check() revalidate later */
			cookie = rt6_get_cookie(rt);
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
				  atomic_read(&rt->dst.__refcnt));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.in6;
	} else {
		/* No dest: uncached lookup, we hold a real dst reference */
		noref = 0;
		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
					      rt_mode);
		if (!dst)
			goto err_unreach;
		rt = (struct rt6_info *) dst;
	}

	local = __ip_vs_is_local_route6(rt);
	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI6\n", daddr);
		goto err_put;
	}
	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			dst_release(&rt->dst);
		return local;
	}

	/* MTU checking */
	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
		mtu = dst_mtu(&rt->dst);
	else {
		/* Tunnel mode: reserve room for the outer IPv6 header */
		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
		if (mtu < IPV6_MIN_MTU) {
			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
				     IPV6_MIN_MTU);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	/* Attach the new route: noref inside RCU, else transfer our ref */
	skb_dst_drop(skb);
	if (noref) {
		if (!local)
			skb_dst_set_noref(skb, &rt->dst);
		else
			skb_dst_set(skb, dst_clone(&rt->dst));
	} else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		dst_release(&rt->dst);
	return -1;

err_unreach:
	/* The ip6_link_failure function requires the dev field to be set
	 * in order to get the net (further for the sake of fwmark
	 * reflection).
	 */
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}
  463. #endif
/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
					    struct ip_vs_conn *cp)
{
	int ret = NF_ACCEPT;

	/* Mark the skb so IPVS hooks do not process it again */
	skb->ipvs_property = 1;
	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
		ret = ip_vs_confirm_conntrack(skb);
	if (ret == NF_ACCEPT) {
		/* Drop conntrack state and let the stack recompute csum */
		nf_reset(skb);
		skb_forward_csum(skb);
		if (!skb->sk)
			skb_sender_cpu_clear(skb);
	}
	return ret;
}
/* In the event of a remote destination, it's possible that we would have
 * matches against an old socket (particularly a TIME-WAIT socket). This
 * causes havoc down the line (ip_local_out et. al. expect regular sockets
 * and invalid memory accesses will happen) so simply drop the association
 * in this case.
 */
static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
{
	/* If dev is set, the packet came from the LOCAL_IN callback and
	 * not from a local TCP socket.
	 */
	if (skb->dev)
		skb_orphan(skb);
}
/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
					 struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	/* Either bypass conntrack entirely or update it with the NAT
	 * mangling we performed, depending on the connection flags.
	 */
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	else
		ip_vs_update_conntrack(skb, cp, 1);

	/* Remove the early_demux association unless it's bound for the
	 * exact same port and address on this host after translation.
	 */
	if (!local || cp->vport != cp->dport ||
	    !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
		ip_vs_drop_early_demux_sk(skb);

	if (!local) {
		skb_forward_csum(skb);
		if (!skb->sk)
			skb_sender_cpu_clear(skb);
		/* Re-inject at LOCAL_OUT so POST_ROUTING etc. run */
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;

	return ret;
}
  520. /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
  521. static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
  522. struct ip_vs_conn *cp, int local)
  523. {
  524. int ret = NF_STOLEN;
  525. skb->ipvs_property = 1;
  526. if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
  527. ip_vs_notrack(skb);
  528. if (!local) {
  529. ip_vs_drop_early_demux_sk(skb);
  530. skb_forward_csum(skb);
  531. if (!skb->sk)
  532. skb_sender_cpu_clear(skb);
  533. NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
  534. NULL, skb_dst(skb)->dev, dst_output);
  535. } else
  536. ret = NF_ACCEPT;
  537. return ret;
  538. }
/*
 * NULL transmitter (do nothing except return NF_ACCEPT)
 */
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	/* we do not touch skb and do not need pskb ptr */
	/* local=1 makes ip_vs_send_or_cont() just mark and accept */
	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
/*
 * Bypass transmitter
 * Let packets bypass the destination when the destination is not
 * available, it may be only used in transparent cache cluster.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct iphdr *iph = ip_hdr(skb);

	EnterFunction(10);

	rcu_read_lock();
	/* Route by the packet's own daddr; dest is intentionally NULL */
	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
		goto tx_error;

	ip_send_check(iph);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
	rcu_read_unlock();

	LeaveFunction(10);
	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	rcu_read_unlock();
	LeaveFunction(10);
	return NF_STOLEN;
}
  577. #ifdef CONFIG_IP_VS_IPV6
/* IPv6 flavor of the bypass transmitter: route by the packet's own
 * destination address when no real server is available.
 */
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);

	EnterFunction(10);

	rcu_read_lock();
	/* dest is intentionally NULL: use daddr from the packet */
	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
				  &iph->daddr, NULL,
				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
		goto tx_error;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
	rcu_read_unlock();

	LeaveFunction(10);
	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	rcu_read_unlock();
	LeaveFunction(10);
	return NF_STOLEN;
}
  601. #endif
/*
 * NAT transmitter (only for outside-to-inside nat forwarding)
 * Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rtable *rt;		/* Route to the other host */
	int local, rc, was_input;

	EnterFunction(10);

	rcu_read_lock();
	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	/* Remember if the original route was an input one, before the
	 * route lookup below replaces skb's dst.
	 */
	was_input = rt_is_input_route(skb_rtable(skb));
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_RDR, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct && !nf_ct_is_untracked(ct)) {
			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit(): stopping DNAT to loopback "
				 "address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (!skb_make_writable(skb, sizeof(struct iphdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ip_hdr(skb)->daddr = cp->daddr.ip;
	ip_send_check(ip_hdr(skb));

	IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when application helper enlarges the packet and the length
	   is larger than the MTU of outgoing device, there will be still
	   MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
	rcu_read_unlock();

	LeaveFunction(10);
	return rc;

tx_error:
	kfree_skb(skb);
	rcu_read_unlock();
	LeaveFunction(10);
	return NF_STOLEN;
}
  680. #ifdef CONFIG_IP_VS_IPV6
/* IPv6 flavor of the NAT transmitter: translate daddr to the real
 * server and re-inject; not used for related ICMP.
 */
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;		/* Route to the other host */
	int local, rc;

	EnterFunction(10);

	rcu_read_lock();
	/* check if it is a connection of no-client-port;
	 * fragoffs != 0 means no transport header to read the port from
	 */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_RDR);
	if (local < 0)
		goto tx_error;
	rt = (struct rt6_info *) skb_dst(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct && !nf_ct_is_untracked(ct)) {
			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit_v6(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit_v6(): "
				 "stopping DNAT to loopback address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ipv6_hdr(skb)->daddr = cp->daddr.in6;

	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when application helper enlarges the packet and the length
	   is larger than the MTU of outgoing device, there will be still
	   MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
	rcu_read_unlock();

	LeaveFunction(10);
	return rc;

tx_error:
	LeaveFunction(10);
	kfree_skb(skb);
	rcu_read_unlock();
	return NF_STOLEN;
}
  756. #endif
/* When forwarding a packet, we must ensure that we've got enough headroom
 * for the encapsulation packet in the skb. This also gives us an
 * opportunity to figure out what the payload_len, dsfield, ttl, and df
 * values should be, so that we won't need to look at the old ip header
 * again
 */
static struct sk_buff *
ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
			   unsigned int max_headroom, __u8 *next_protocol,
			   __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
			   __be16 *df)
{
	struct sk_buff *new_skb = NULL;
	struct iphdr *old_iph = NULL;
#ifdef CONFIG_IP_VS_IPV6
	struct ipv6hdr *old_ipv6h = NULL;
#endif

	ip_vs_drop_early_demux_sk(skb);

	/* Reallocate if headroom is insufficient or skb is shared; the
	 * old skb is consumed and ownership moved to the new one.
	 */
	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
		new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb)
			goto error;
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		consume_skb(skb);
		skb = new_skb;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		old_ipv6h = ipv6_hdr(skb);
		*next_protocol = IPPROTO_IPV6;
		if (payload_len)
			*payload_len =
				ntohs(old_ipv6h->payload_len) +
				sizeof(*old_ipv6h);
		*dsfield = ipv6_get_dsfield(old_ipv6h);
		*ttl = old_ipv6h->hop_limit;
		if (df)
			*df = 0;
	} else
#endif
	{
		old_iph = ip_hdr(skb);
		/* Copy DF, reset fragment offset and MF */
		if (df)
			*df = (old_iph->frag_off & htons(IP_DF));
		*next_protocol = IPPROTO_IPIP;

		/* fix old IP header checksum */
		ip_send_check(old_iph);
		*dsfield = ipv4_get_dsfield(old_iph);
		*ttl = old_iph->ttl;
		if (payload_len)
			*payload_len = ntohs(old_iph->tot_len);
	}

	return skb;
error:
	kfree_skb(skb);
	return ERR_PTR(-ENOMEM);
}
  816. static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
  817. {
  818. if (encaps_af == AF_INET) {
  819. if (orig_af == AF_INET)
  820. return SKB_GSO_IPIP;
  821. return SKB_GSO_SIT;
  822. }
  823. /* GSO: we need to provide proper SKB_GSO_ value for IPv6:
  824. * SKB_GSO_SIT/IPV6
  825. */
  826. return 0;
  827. }
  828. /*
  829. * IP Tunneling transmitter
  830. *
  831. * This function encapsulates the packet in a new IP packet, its
  832. * destination will be set to cp->daddr. Most code of this function
  833. * is taken from ipip.c.
  834. *
  835. * It is used in VS/TUN cluster. The load balancer selects a real
  836. * server from a cluster based on a scheduling algorithm,
  837. * encapsulates the request packet and forwards it to the selected
  838. * server. For example, all real servers are configured with
 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 * the encapsulated packet, it will decapsulate the packet, process
 * the request and return the response packets directly to the client
 * without passing through the load balancer. This can greatly increase
 * the scalability of the virtual server.
  844. *
  845. * Used for ANY protocol
  846. */
/*
 * ip_vs_tunnel_xmit - VS/TUN transmitter, IPv4 outer header.
 *
 * Encapsulates @skb in a new IPv4 header addressed to cp->daddr.ip and
 * hands it to ip_local_out().  Returns NF_STOLEN in every path (the skb
 * is either queued for transmit or freed), except when the destination
 * resolves to a local address, in which case the packet is delivered
 * unmodified via ip_vs_send_or_cont().
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rtable *rt;			/* Route to the other host */
	__be32 saddr;				/* Source for tunnel */
	struct net_device *tdev;		/* Device to other host */
	__u8 next_protocol = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	__be16 df = 0;
	__be16 *dfp = NULL;
	struct iphdr *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int ret, local;

	EnterFunction(10);

	rcu_read_lock();
	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_CONNECT |
				   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local) {
		/* Real server is a local address: no encapsulation needed */
		rcu_read_unlock();
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
	}

	rt = skb_rtable(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;

	/* On failure these helpers free the skb and return an ERR_PTR,
	 * which the tx_error path must not free again.
	 */
	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, NULL, &dsfield,
					 &ttl, dfp);
	if (IS_ERR(skb))
		goto tx_error;

	skb = iptunnel_handle_offloads(
		skb, false, __tun_gso_type_mask(AF_INET, cp->af));
	if (IS_ERR(skb))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 * Push down and install the IPIP header.
	 */
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr)>>2;
	iph->frag_off = df;
	iph->protocol = next_protocol;
	iph->tos = dsfield;
	iph->daddr = cp->daddr.ip;
	iph->saddr = saddr;
	iph->ttl = ttl;
	ip_select_ident(net, skb, NULL);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	rcu_read_unlock();

	LeaveFunction(10);

	return NF_STOLEN;

tx_error:
	/* skb may be an ERR_PTR left by the prepare/offload helpers */
	if (!IS_ERR(skb))
		kfree_skb(skb);
	rcu_read_unlock();
	LeaveFunction(10);
	return NF_STOLEN;
}
#ifdef CONFIG_IP_VS_IPV6
/*
 * ip_vs_tunnel_xmit_v6 - VS/TUN transmitter, IPv6 outer header.
 *
 * Encapsulates @skb in a new IPv6 header addressed to cp->daddr.in6 and
 * hands it to ip6_local_out().  Returns NF_STOLEN in every path except
 * a local destination, which is delivered without encapsulation.
 */
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;		/* Route to the other host */
	struct in6_addr saddr;		/* Source for tunnel */
	struct net_device *tdev;	/* Device to other host */
	__u8 next_protocol = 0;
	__u32 payload_len = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	struct ipv6hdr *iph;		/* Our new IP header */
	unsigned int max_headroom;	/* The extra header space needed */
	int ret, local;

	EnterFunction(10);

	rcu_read_lock();
	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      &saddr, ipvsh, 1,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_TUNNEL);
	if (local < 0)
		goto tx_error;
	if (local) {
		/* Real server is a local address: no encapsulation needed */
		rcu_read_unlock();
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
	}

	rt = (struct rt6_info *) skb_dst(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

	/* On failure these helpers free the skb and return an ERR_PTR,
	 * which the tx_error path must not free again.
	 */
	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, &payload_len,
					 &dsfield, &ttl, NULL);
	if (IS_ERR(skb))
		goto tx_error;

	skb = iptunnel_handle_offloads(
		skb, false, __tun_gso_type_mask(AF_INET6, cp->af));
	if (IS_ERR(skb))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	/* NOTE(review): IPCB (not IP6CB) is used here to zero the skb
	 * control block on an IPv6 packet; both alias skb->cb, but the
	 * cleared size differs — confirm against upstream intent.
	 */
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 * Push down and install the IPIP header.
	 */
	iph = ipv6_hdr(skb);
	iph->version = 6;
	iph->nexthdr = next_protocol;
	iph->payload_len = htons(payload_len);
	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
	ipv6_change_dsfield(iph, 0, dsfield);
	iph->daddr = cp->daddr.in6;
	iph->saddr = saddr;
	iph->hop_limit = ttl;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip6_local_out(cp->ipvs->net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	rcu_read_unlock();

	LeaveFunction(10);

	return NF_STOLEN;

tx_error:
	/* skb may be an ERR_PTR left by the prepare/offload helpers */
	if (!IS_ERR(skb))
		kfree_skb(skb);
	rcu_read_unlock();
	LeaveFunction(10);
	return NF_STOLEN;
}
#endif
  1006. /*
  1007. * Direct Routing transmitter
  1008. * Used for ANY protocol
  1009. */
  1010. int
  1011. ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
  1012. struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
  1013. {
  1014. int local;
  1015. EnterFunction(10);
  1016. rcu_read_lock();
  1017. local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
  1018. IP_VS_RT_MODE_LOCAL |
  1019. IP_VS_RT_MODE_NON_LOCAL |
  1020. IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
  1021. if (local < 0)
  1022. goto tx_error;
  1023. if (local) {
  1024. rcu_read_unlock();
  1025. return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
  1026. }
  1027. ip_send_check(ip_hdr(skb));
  1028. /* Another hack: avoid icmp_send in ip_fragment */
  1029. skb->ignore_df = 1;
  1030. ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
  1031. rcu_read_unlock();
  1032. LeaveFunction(10);
  1033. return NF_STOLEN;
  1034. tx_error:
  1035. kfree_skb(skb);
  1036. rcu_read_unlock();
  1037. LeaveFunction(10);
  1038. return NF_STOLEN;
  1039. }
  1040. #ifdef CONFIG_IP_VS_IPV6
  1041. int
  1042. ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
  1043. struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
  1044. {
  1045. int local;
  1046. EnterFunction(10);
  1047. rcu_read_lock();
  1048. local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
  1049. &cp->daddr.in6,
  1050. NULL, ipvsh, 0,
  1051. IP_VS_RT_MODE_LOCAL |
  1052. IP_VS_RT_MODE_NON_LOCAL |
  1053. IP_VS_RT_MODE_KNOWN_NH);
  1054. if (local < 0)
  1055. goto tx_error;
  1056. if (local) {
  1057. rcu_read_unlock();
  1058. return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
  1059. }
  1060. /* Another hack: avoid icmp_send in ip_fragment */
  1061. skb->ignore_df = 1;
  1062. ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
  1063. rcu_read_unlock();
  1064. LeaveFunction(10);
  1065. return NF_STOLEN;
  1066. tx_error:
  1067. kfree_skb(skb);
  1068. rcu_read_unlock();
  1069. LeaveFunction(10);
  1070. return NF_STOLEN;
  1071. }
  1072. #endif
  1073. /*
  1074. * ICMP packet transmitter
  1075. * called by the ip_vs_in_icmp
  1076. */
/*
 * ip_vs_icmp_xmit - transmit an ICMP error related to a connection.
 *
 * For non-NAT forwarding methods (VS/TUN, VS/DR, LOCALNODE) the packet
 * is forwarded via the connection's packet_xmit without any address
 * rewriting.  For VS/NAT the embedded payload is translated with
 * ip_vs_nat_icmp() before sending.  @offset is the length that must be
 * made writable before mangling (presumably up to and including the
 * embedded headers — confirm with callers in ip_vs_in_icmp).
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		struct ip_vs_iphdr *iph)
{
	struct rtable *rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode, was_input;

	EnterFunction(10);

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, iph);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		goto out;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */
	/* Remember whether the skb arrived on an input route before the
	 * route lookup below replaces it.
	 */
	was_input = rt_is_input_route(skb_rtable(skb));

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	rcu_read_lock();
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
				   NULL, iph);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);

	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct && !nf_ct_is_untracked(ct)) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI4\n",
				  __func__, &cp->daddr.ip);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI4\n",
			  __func__, &cp->daddr.ip);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (!skb_make_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
	rcu_read_unlock();
	goto out;

tx_error:
	kfree_skb(skb);
	rcu_read_unlock();
	rc = NF_STOLEN;
out:
	LeaveFunction(10);
	return rc;
}
#ifdef CONFIG_IP_VS_IPV6
/*
 * ip_vs_icmp_xmit_v6 - transmit an ICMPv6 error related to a connection.
 *
 * IPv6 counterpart of ip_vs_icmp_xmit(): non-NAT methods forward via
 * cp->packet_xmit untouched; VS/NAT translates the embedded payload
 * with ip_vs_nat_icmp_v6() before sending.  @offset is the length that
 * must be made writable before mangling (presumably up to and including
 * the embedded headers — confirm with callers in ip_vs_in_icmp_v6).
 */
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		   struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		   struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode;

	EnterFunction(10);

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		goto out;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */
	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	rcu_read_lock();
	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
	if (local < 0)
		goto tx_error;
	rt = (struct rt6_info *) skb_dst(skb);

	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct && !nf_ct_is_untracked(ct)) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI6\n",
				  __func__, &cp->daddr.in6);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI6\n",
			  __func__, &cp->daddr.in6);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (!skb_make_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp_v6(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
	rcu_read_unlock();
	goto out;

tx_error:
	kfree_skb(skb);
	rcu_read_unlock();
	rc = NF_STOLEN;
out:
	LeaveFunction(10);
	return rc;
}
#endif