nf_nat_core.c

/*
 * (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2011 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
#include <linux/jhash.h>
#include <linux/rtnetlink.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/netfilter/nf_nat_l4proto.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>
static DEFINE_SPINLOCK(nf_nat_lock);

static DEFINE_MUTEX(nf_nat_proto_mutex);
static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
						__read_mostly;
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
						__read_mostly;
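
/* Lookup helpers for the registered L3/L4 NAT protocol handlers. They use
 * rcu_dereference(), so callers are expected to hold rcu_read_lock() (or be
 * otherwise RCU-protected) while using the returned pointers.
 */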
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
{
	return rcu_dereference(nf_nat_l3protos[family]);
}

inline const struct nf_nat_l4proto *
__nf_nat_l4proto_find(u8 family, u8 protonum)
{
	return rcu_dereference(nf_nat_l4protos[family][protonum]);
}
EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find);

#ifdef CONFIG_XFRM
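/* Fill in the flow structure for an xfrm policy lookup, based on which
 * direction of the connection was NATed (DST NAT for the original direction,
 * SRC NAT for the reply direction). Installed as nf_nat_decode_session_hook
 * in nf_nat_init() below.
 */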
static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
	const struct nf_nat_l3proto *l3proto;
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	unsigned long statusbit;
	u8 family;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return;

	family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
	rcu_read_lock();
	l3proto = __nf_nat_l3proto_find(family);
	if (l3proto == NULL)
		goto out;

	dir = CTINFO2DIR(ctinfo);
	if (dir == IP_CT_DIR_ORIGINAL)
		statusbit = IPS_DST_NAT;
	else
		statusbit = IPS_SRC_NAT;

	l3proto->decode_session(skb, ct, dir, statusbit, fl);
out:
	rcu_read_unlock();
}

int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
{
	struct flowi fl;
	unsigned int hh_len;
	struct dst_entry *dst;
	int err;

	err = xfrm_decode_session(skb, &fl, family);
	if (err < 0)
		return err;

	dst = skb_dst(skb);
	if (dst->xfrm)
		dst = ((struct xfrm_dst *)dst)->route;
	dst_hold(dst);

	dst = xfrm_lookup(net, dst, &fl, skb->sk, 0);
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	/* Change in oif may mean change in hh_len. */
	hh_len = skb_dst(skb)->dev->hard_header_len;
	if (skb_headroom(skb) < hh_len &&
	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
		return -ENOMEM;
	return 0;
}
EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;

	/* Original src, to ensure we map it consistently if poss. */
	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
		      tuple->dst.protonum ^ nf_conntrack_hash_rnd);

	return reciprocal_scale(hash, net->ct.nat_htable_size);
}

/* Is this tuple already taken? (not by us) */
int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
		  const struct nf_conn *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	 * incoming ones.  NAT means they don't have a fixed mapping,
	 * so we invert the tuple and look for the incoming reply.
	 *
	 * We could keep a separate hash if this proves too slow.
	 */
	struct nf_conntrack_tuple reply;

	nf_ct_invert_tuplepr(&reply, tuple);
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}
EXPORT_SYMBOL(nf_nat_used_tuple);
/* If we source-map this tuple so the reply looks like reply_tuple, will
 * that meet the constraints of the range?
 */
static int in_range(const struct nf_nat_l3proto *l3proto,
		    const struct nf_nat_l4proto *l4proto,
		    const struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range *range)
{
	/* If we are supposed to map IPs, then we must be in the
	 * range specified, otherwise let this drag us onto a new src IP.
	 */
	if (range->flags & NF_NAT_RANGE_MAP_IPS &&
	    !l3proto->in_range(tuple, range))
		return 0;

	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
	    l4proto->in_range(tuple, NF_NAT_MANIP_SRC,
			      &range->min_proto, &range->max_proto))
		return 1;

	return 0;
}
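
/* Does @tuple use the same protocol, source address and source port as the
 * original direction of @ct?
 */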
static inline int
same_src(const struct nf_conn *ct,
	 const struct nf_conntrack_tuple *tuple)
{
	const struct nf_conntrack_tuple *t;

	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	return (t->dst.protonum == tuple->dst.protonum &&
		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
		t->src.u.all == tuple->src.u.all);
}

/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_nat_l3proto *l3proto,
		     const struct nf_nat_l4proto *l4proto,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range *range)
{
	unsigned int h = hash_by_src(net, tuple);
	const struct nf_conn_nat *nat;
	const struct nf_conn *ct;

	hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
		ct = nat->ct;
		if (same_src(ct, tuple) &&
		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
			/* Copy source part from reply tuple. */
			nf_ct_invert_tuplepr(result,
				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			if (in_range(l3proto, l4proto, result, range))
				return 1;
		}
	}
	return 0;
}
/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
		    struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	union nf_inet_addr *var_ipp;
	unsigned int i, max;
	/* Host order */
	u32 minip, maxip, j, dist;
	bool full_range;

	/* No IP mapping?  Do nothing. */
	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
		return;

	if (maniptype == NF_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3;
	else
		var_ipp = &tuple->dst.u3;

	/* Fast path: only one choice. */
	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
		*var_ipp = range->min_addr;
		return;
	}

	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
	else
		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots.
	 */
	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
		   range->flags & NF_NAT_RANGE_PERSISTENT ?
			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

	full_range = false;
	for (i = 0; i <= max; i++) {
		/* If first bytes of the address are at the maximum, use the
		 * distance. Otherwise use the full range.
		 */
		if (!full_range) {
			minip = ntohl((__force __be32)range->min_addr.all[i]);
			maxip = ntohl((__force __be32)range->max_addr.all[i]);
			dist  = maxip - minip + 1;
		} else {
			minip = 0;
			dist  = ~0;
		}

		var_ipp->all[i] = (__force __u32)
			htonl(minip + reciprocal_scale(j, dist));
		if (var_ipp->all[i] != range->max_addr.all[i])
			full_range = true;

		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
			j ^= (__force u32)tuple->dst.u3.all[i];
	}
}
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	const struct nf_conntrack_zone *zone;
	const struct nf_nat_l3proto *l3proto;
	const struct nf_nat_l4proto *l4proto;
	struct net *net = nf_ct_net(ct);

	zone = nf_ct_zone(ct);

	rcu_read_lock();
	l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
	l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
					orig_tuple->dst.protonum);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	 * and that same mapping gives a unique tuple within the given
	 * range, use that.
	 *
	 * This is only required for source (ie. NAT/masq) mappings.
	 * So far, we don't do local source mappings, so multiple
	 * manips are not an issue.
	 */
	if (maniptype == NF_NAT_MANIP_SRC &&
	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		/* try the original tuple first */
		if (in_range(l3proto, l4proto, orig_tuple, range)) {
			if (!nf_nat_used_tuple(orig_tuple, ct)) {
				*tuple = *orig_tuple;
				goto out;
			}
		} else if (find_appropriate_src(net, zone, l3proto, l4proto,
						orig_tuple, tuple, range)) {
			pr_debug("get_unique_tuple: Found current src map\n");
			if (!nf_nat_used_tuple(tuple, ct))
				goto out;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given range */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	 * the range to make a unique tuple.
	 */

	/* Only bother mapping if it's not already in range and unique */
	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
			if (l4proto->in_range(tuple, maniptype,
					      &range->min_proto,
					      &range->max_proto) &&
			    (range->min_proto.all == range->max_proto.all ||
			     !nf_nat_used_tuple(tuple, ct)))
				goto out;
		} else if (!nf_nat_used_tuple(tuple, ct)) {
			goto out;
		}
	}

	/* Last chance: get the protocol to try to obtain a unique tuple. */
	l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
out:
	rcu_read_unlock();
}
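
/* Attach the NAT extension to @ct if it is not already present. The extension
 * can only be added while the conntrack is still unconfirmed; otherwise the
 * existing (possibly NULL) extension is returned.
 */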
struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
{
	struct nf_conn_nat *nat = nfct_nat(ct);

	if (nat)
		return nat;

	if (!nf_ct_is_confirmed(ct))
		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);

	return nat;
}
EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
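
/* Set up the NAT binding for @ct: pick a unique tuple inside @range, alter
 * the conntrack's reply tuple accordingly and, for source manips, insert the
 * conntrack into the bysource hash used by find_appropriate_src().
 */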
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;
	struct nf_conn_nat *nat;

	/* nat helper or nfctnetlink also setup binding */
	nat = nf_ct_nat_ext_add(ct);
	if (nat == NULL)
		return NF_ACCEPT;

	NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
		     maniptype == NF_NAT_MANIP_DST);
	BUG_ON(nf_nat_initialized(ct, maniptype));

	/* What we've got will look like inverse of reply. Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
	nf_ct_invert_tuplepr(&curr_tuple,
			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so will recognize replies. */
		nf_ct_invert_tuplepr(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		if (nfct_help(ct))
			nfct_seqadj_ext_add(ct);
	}

	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;

		srchash = hash_by_src(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		spin_lock_bh(&nf_nat_lock);
		/* nf_conntrack_alter_reply might re-allocate extension area */
		nat = nfct_nat(ct);
		nat->ct = ct;
		hlist_add_head_rcu(&nat->bysource,
				   &net->ct.nat_bysource[srchash]);
		spin_unlock_bh(&nf_nat_lock);
	}

	/* It's done. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);
static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
	/* Force range to this IP; let proto decide mapping for
	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
	 * Use reply in case it's already been mangled (eg local packet).
	 */
	union nf_inet_addr ip =
		(manip == NF_NAT_MANIP_SRC ?
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
	struct nf_nat_range range = {
		.flags		= NF_NAT_RANGE_MAP_IPS,
		.min_addr	= ip,
		.max_addr	= ip,
	};
	return nf_nat_setup_info(ct, &range, manip);
}

unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	const struct nf_nat_l3proto *l3proto;
	const struct nf_nat_l4proto *l4proto;
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	unsigned long statusbit;
	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);

	if (mtype == NF_NAT_MANIP_SRC)
		statusbit = IPS_SRC_NAT;
	else
		statusbit = IPS_DST_NAT;

	/* Invert if this is reply dir. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (ct->status & statusbit) {
		struct nf_conntrack_tuple target;

		/* We are aiming to look like inverse of other direction. */
		nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

		l3proto = __nf_nat_l3proto_find(target.src.l3num);
		l4proto = __nf_nat_l4proto_find(target.src.l3num,
						target.dst.protonum);
		if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
			return NF_DROP;
	}
	return NF_ACCEPT;
}
EXPORT_SYMBOL_GPL(nf_nat_packet);
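
/* Cleanup helpers used when a NAT L3/L4 protocol module is unregistered or a
 * network namespace is torn down.
 */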
struct nf_nat_proto_clean {
	u8	l3proto;
	u8	l4proto;
};

/* kill conntracks with affected NAT section */
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
	const struct nf_nat_proto_clean *clean = data;
	struct nf_conn_nat *nat = nfct_nat(i);

	if (!nat)
		return 0;

	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
		return 0;

	return i->status & IPS_NAT_MASK ? 1 : 0;
}
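
/* Called on netns exit: conntracks with a real NAT binding are killed via
 * nf_nat_proto_remove(), while conntracks that only carry a null binding are
 * unhashed from bysource so the about-to-be-freed table is no longer
 * referenced.
 */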
static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
	struct nf_conn_nat *nat = nfct_nat(ct);

	if (nf_nat_proto_remove(ct, data))
		return 1;

	if (!nat || !nat->ct)
		return 0;

	/* This netns is being destroyed, and conntrack has nat null binding.
	 * Remove it from bysource hash, as the table will be freed soon.
	 *
	 * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
	 * will delete entry from already-freed table.
	 */
	if (!del_timer(&ct->timeout))
		return 1;

	spin_lock_bh(&nf_nat_lock);
	hlist_del_rcu(&nat->bysource);
	ct->status &= ~IPS_NAT_DONE_MASK;
	nat->ct = NULL;
	spin_unlock_bh(&nf_nat_lock);

	add_timer(&ct->timeout);

	/* don't delete conntrack.  Although that would make things a lot
	 * simpler, we'd end up flushing all conntracks on nat rmmod.
	 */
	return 0;
}
static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto)
{
	struct nf_nat_proto_clean clean = {
		.l3proto = l3proto,
		.l4proto = l4proto,
	};
	struct net *net;

	rtnl_lock();
	for_each_net(net)
		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
	rtnl_unlock();
}

static void nf_nat_l3proto_clean(u8 l3proto)
{
	struct nf_nat_proto_clean clean = {
		.l3proto = l3proto,
	};
	struct net *net;

	rtnl_lock();
	for_each_net(net)
		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
	rtnl_unlock();
}
/* Protocol registration. */
int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto)
{
	const struct nf_nat_l4proto **l4protos;
	unsigned int i;
	int ret = 0;

	mutex_lock(&nf_nat_proto_mutex);
	if (nf_nat_l4protos[l3proto] == NULL) {
		l4protos = kmalloc(IPPROTO_MAX * sizeof(struct nf_nat_l4proto *),
				   GFP_KERNEL);
		if (l4protos == NULL) {
			ret = -ENOMEM;
			goto out;
		}

		for (i = 0; i < IPPROTO_MAX; i++)
			RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown);

		/* Before making proto_array visible to lockless readers,
		 * we must make sure its content is committed to memory.
		 */
		smp_wmb();

		nf_nat_l4protos[l3proto] = l4protos;
	}

	if (rcu_dereference_protected(
			nf_nat_l4protos[l3proto][l4proto->l4proto],
			lockdep_is_held(&nf_nat_proto_mutex)
			) != &nf_nat_l4proto_unknown) {
		ret = -EBUSY;
		goto out;
	}
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto);
 out:
	mutex_unlock(&nf_nat_proto_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_l4proto_register);

/* No one stores the protocol anywhere; simply delete it. */
void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto)
{
	mutex_lock(&nf_nat_proto_mutex);
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto],
			 &nf_nat_l4proto_unknown);
	mutex_unlock(&nf_nat_proto_mutex);
	synchronize_rcu();

	nf_nat_l4proto_clean(l3proto, l4proto->l4proto);
}
EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);
int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
{
	int err;

	err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
	if (err < 0)
		return err;

	mutex_lock(&nf_nat_proto_mutex);
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
			 &nf_nat_l4proto_tcp);
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
			 &nf_nat_l4proto_udp);
	mutex_unlock(&nf_nat_proto_mutex);

	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_nat_l3proto_register);

void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
{
	mutex_lock(&nf_nat_proto_mutex);
	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL);
	mutex_unlock(&nf_nat_proto_mutex);
	synchronize_rcu();

	nf_nat_l3proto_clean(l3proto->l3proto);
	nf_ct_l3proto_module_put(l3proto->l3proto);
}
EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
/* No one is using the conntrack by the time this is called. */
static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
	struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);

	if (nat == NULL || nat->ct == NULL)
		return;

	NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);

	spin_lock_bh(&nf_nat_lock);
	hlist_del_rcu(&nat->bysource);
	spin_unlock_bh(&nf_nat_lock);
}
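
/* ->move callback of the extension: when the conntrack extension area is
 * reallocated, re-link the bysource hash entry so it points at the new
 * nf_conn_nat location.
 */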
static void nf_nat_move_storage(void *new, void *old)
{
	struct nf_conn_nat *new_nat = new;
	struct nf_conn_nat *old_nat = old;
	struct nf_conn *ct = old_nat->ct;

	if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
		return;

	spin_lock_bh(&nf_nat_lock);
	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
	spin_unlock_bh(&nf_nat_lock);
}

static struct nf_ct_ext_type nat_extend __read_mostly = {
	.len		= sizeof(struct nf_conn_nat),
	.align		= __alignof__(struct nf_conn_nat),
	.destroy	= nf_nat_cleanup_conntrack,
	.move		= nf_nat_move_storage,
	.id		= NF_CT_EXT_NAT,
	.flags		= NF_CT_EXT_F_PREALLOC,
};
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
};

static int nfnetlink_parse_nat_proto(struct nlattr *attr,
				     const struct nf_conn *ct,
				     struct nf_nat_range *range)
{
	struct nlattr *tb[CTA_PROTONAT_MAX+1];
	const struct nf_nat_l4proto *l4proto;
	int err;

	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
	if (err < 0)
		return err;

	l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto->nlattr_to_range)
		err = l4proto->nlattr_to_range(tb, range);

	return err;
}

static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
	[CTA_NAT_V4_MINIP]	= { .type = NLA_U32 },
	[CTA_NAT_V4_MAXIP]	= { .type = NLA_U32 },
	[CTA_NAT_V6_MINIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_V6_MAXIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
};

static int
nfnetlink_parse_nat(const struct nlattr *nat,
		    const struct nf_conn *ct, struct nf_nat_range *range,
		    const struct nf_nat_l3proto *l3proto)
{
	struct nlattr *tb[CTA_NAT_MAX+1];
	int err;

	memset(range, 0, sizeof(*range));

	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
	if (err < 0)
		return err;

	err = l3proto->nlattr_to_range(tb, range);
	if (err < 0)
		return err;

	if (!tb[CTA_NAT_PROTO])
		return 0;

	return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
}

/* This function is called under rcu_read_lock() */
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	struct nf_nat_range range;
	const struct nf_nat_l3proto *l3proto;
	int err;

	/* Should not happen, restricted to creating new conntracks
	 * via ctnetlink.
	 */
	if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
		return -EEXIST;

	/* Make sure that L3 NAT is there by when we call nf_nat_setup_info to
	 * attach the null binding, otherwise this may oops.
	 */
	l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
	if (l3proto == NULL)
		return -EAGAIN;

	/* No NAT information has been passed, allocate the null-binding */
	if (attr == NULL)
		return __nf_nat_alloc_null_binding(ct, manip);

	err = nfnetlink_parse_nat(attr, ct, &range, l3proto);
	if (err < 0)
		return err;

	return nf_nat_setup_info(ct, &range, manip);
}
#else
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	return -EOPNOTSUPP;
}
#endif
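
/* Per-netns setup/teardown: the bysource hash is sized like the main
 * conntrack table and freed again on namespace exit, after all NAT bindings
 * have been removed by nf_nat_proto_clean().
 */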
static int __net_init nf_nat_net_init(struct net *net)
{
	/* Leave them the same for the moment. */
	net->ct.nat_htable_size = net->ct.htable_size;
	net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
	if (!net->ct.nat_bysource)
		return -ENOMEM;
	return 0;
}

static void __net_exit nf_nat_net_exit(struct net *net)
{
	struct nf_nat_proto_clean clean = {};

	nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
	synchronize_rcu();
	nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
}

static struct pernet_operations nf_nat_net_ops = {
	.init = nf_nat_net_init,
	.exit = nf_nat_net_exit,
};
static struct nf_ct_helper_expectfn follow_master_nat = {
	.name		= "nat-follow-master",
	.expectfn	= nf_nat_follow_master,
};

static int __init nf_nat_init(void)
{
	int ret;

	ret = nf_ct_extend_register(&nat_extend);
	if (ret < 0) {
		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
		return ret;
	}

	ret = register_pernet_subsys(&nf_nat_net_ops);
	if (ret < 0)
		goto cleanup_extend;

	nf_ct_helper_expectfn_register(&follow_master_nat);

	/* Initialize fake conntrack so that NAT will skip it */
	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);

	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
			 nfnetlink_parse_nat_setup);
#ifdef CONFIG_XFRM
	BUG_ON(nf_nat_decode_session_hook != NULL);
	RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
#endif
	return 0;

 cleanup_extend:
	nf_ct_extend_unregister(&nat_extend);
	return ret;
}

static void __exit nf_nat_cleanup(void)
{
	unsigned int i;

	unregister_pernet_subsys(&nf_nat_net_ops);
	nf_ct_extend_unregister(&nat_extend);
	nf_ct_helper_expectfn_unregister(&follow_master_nat);
	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
#ifdef CONFIG_XFRM
	RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
#endif
	synchronize_rcu();

	for (i = 0; i < NFPROTO_NUMPROTO; i++)
		kfree(nf_nat_l4protos[i]);

	synchronize_net();
}

MODULE_LICENSE("GPL");

module_init(nf_nat_init);
module_exit(nf_nat_cleanup);