/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
        __dm_written_to_disk(src)
{
        memcpy(dest, src, len);
        __dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
                         unsigned index, void *elt)
        __dm_written_to_disk(elt)
{
        if (index < nr_elts)
                memmove(base + (elt_size * (index + 1)),
                        base + (elt_size * index),
                        (nr_elts - index) * elt_size);

        memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
        int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

        while (hi - lo > 1) {
                int mid = lo + ((hi - lo) / 2);
                uint64_t mid_key = le64_to_cpu(n->keys[mid]);

                if (mid_key == key)
                        return mid;

                if (mid_key < key)
                        lo = mid;
                else
                        hi = mid;
        }

        return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
        return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
        return bsearch(n, key, 1);
}
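
/*
 * Illustrative note (not from the original source): with keys {10, 20, 30}
 * in a node, lower_bound() returns the index of the largest key <= the
 * search key (or -1 if there is none), and upper_bound() the index of the
 * smallest key >= it (or nr_entries if there is none).  So for key 25,
 * lower_bound() -> 1 and upper_bound() -> 2; for key 20 both return 1; for
 * key 5 they return -1 and 0 respectively.
 */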

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
                  struct dm_btree_value_type *vt)
{
        unsigned i;
        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

        if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
                for (i = 0; i < nr_entries; i++)
                        dm_tm_inc(tm, value64(n, i));
        else if (vt->inc)
                for (i = 0; i < nr_entries; i++)
                        vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
                     uint64_t key, void *value)
        __dm_written_to_disk(value)
{
        uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
        __le64 key_le = cpu_to_le64(key);

        if (index > nr_entries ||
            index >= le32_to_cpu(node->header.max_entries)) {
                DMERR("too many entries in btree node for insert");
                __dm_unbless_for_disk(value);
                return -ENOMEM;
        }

        __dm_bless_for_disk(&key_le);

        array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
        array_insert(value_base(node), value_size, nr_entries, index, value);
        node->header.nr_entries = cpu_to_le32(nr_entries + 1);

        return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1) would.
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
        uint32_t total, n;
        size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

        block_size -= sizeof(struct node_header);
        total = block_size / elt_size;
        n = total / 3;          /* rounds down */

        return 3 * n;
}
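
/*
 * Worked example (illustrative only; assumes a 4096-byte block and a
 * 32-byte struct node_header): with an 8-byte value, elt_size = 8 + 8 = 16,
 * (4096 - 32) / 16 = 254 entries would fit, 254 / 3 = 84, so
 * calc_max_entries() returns 3 * 84 = 252.
 */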

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
        int r;
        struct dm_block *b;
        struct btree_node *n;
        size_t block_size;
        uint32_t max_entries;

        r = new_block(info, &b);
        if (r < 0)
                return r;

        block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
        max_entries = calc_max_entries(info->value_type.size, block_size);

        n = dm_block_data(b);
        memset(n, 0, block_size);
        n->header.flags = cpu_to_le32(LEAF_NODE);
        n->header.nr_entries = cpu_to_le32(0);
        n->header.max_entries = cpu_to_le32(max_entries);
        n->header.value_size = cpu_to_le32(info->value_type.size);

        *root = dm_block_location(b);
        unlock_block(info, b);

        return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
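
/*
 * Hedged usage sketch (illustrative, not part of the original file): a
 * caller typically fills in a struct dm_btree_info and then creates an
 * empty tree.  Field names follow dm-btree.h; "tm" stands for an
 * already-created transaction manager and error handling is elided.
 *
 *      struct dm_btree_info info = {
 *              .tm = tm,
 *              .levels = 1,
 *              .value_type = {
 *                      .context = NULL,
 *                      .size = sizeof(__le64),
 *                      .inc = NULL,
 *                      .dec = NULL,
 *                      .equal = NULL,
 *              },
 *      };
 *      dm_block_t root;
 *
 *      r = dm_btree_empty(&info, &root);
 */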

/*----------------------------------------------------------------*/

/*
 * Deletion is conceptually recursive, but since we have limited stack
 * space we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64

struct frame {
        struct dm_block *b;
        struct btree_node *n;
        unsigned level;
        unsigned nr_children;
        unsigned current_child;
};

struct del_stack {
        struct dm_btree_info *info;
        struct dm_transaction_manager *tm;
        int top;
        struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
        if (s->top < 0) {
                DMERR("btree deletion stack empty");
                return -EINVAL;
        }

        *f = s->spine + s->top;

        return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
        return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
        unsigned i;
        struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

        for (i = 0; i < f->nr_children; i++)
                dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
        return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
        int r;
        uint32_t ref_count;

        if (s->top >= MAX_SPINE_DEPTH - 1) {
                DMERR("btree deletion stack out of memory");
                return -ENOMEM;
        }

        r = dm_tm_ref(s->tm, b, &ref_count);
        if (r)
                return r;

        if (ref_count > 1)
                /*
                 * This is a shared node, so we can just decrement its
                 * reference counter and leave the children.
                 */
                dm_tm_dec(s->tm, b);

        else {
                uint32_t flags;
                struct frame *f = s->spine + ++s->top;

                r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
                if (r) {
                        s->top--;
                        return r;
                }

                f->n = dm_block_data(f->b);
                f->level = level;
                f->nr_children = le32_to_cpu(f->n->header.nr_entries);
                f->current_child = 0;

                flags = le32_to_cpu(f->n->header.flags);
                if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
                        prefetch_children(s, f);
        }

        return 0;
}

static void pop_frame(struct del_stack *s)
{
        struct frame *f = s->spine + s->top--;

        dm_tm_dec(s->tm, dm_block_location(f->b));
        dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
        struct frame *f;

        while (unprocessed_frames(s)) {
                f = s->spine + s->top--;
                dm_tm_unlock(s->tm, f->b);
        }
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
        int r;
        struct del_stack *s;

        s = kmalloc(sizeof(*s), GFP_NOIO);
        if (!s)
                return -ENOMEM;
        s->info = info;
        s->tm = info->tm;
        s->top = -1;

        r = push_frame(s, root, 0);
        if (r)
                goto out;

        while (unprocessed_frames(s)) {
                uint32_t flags;
                struct frame *f;
                dm_block_t b;

                r = top_frame(s, &f);
                if (r)
                        goto out;

                if (f->current_child >= f->nr_children) {
                        pop_frame(s);
                        continue;
                }

                flags = le32_to_cpu(f->n->header.flags);
                if (flags & INTERNAL_NODE) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level);
                        if (r)
                                goto out;

                } else if (is_internal_level(info, f)) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level + 1);
                        if (r)
                                goto out;

                } else {
                        if (info->value_type.dec) {
                                unsigned i;

                                for (i = 0; i < f->nr_children; i++)
                                        info->value_type.dec(info->value_type.context,
                                                             value_ptr(f->n, i));
                        }
                        pop_frame(s);
                }
        }
out:
        if (r) {
                /* clean up all remaining frames of the del_stack */
                unlock_all_frames(s);
        }
        kfree(s);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
                            int (*search_fn)(struct btree_node *, uint64_t),
                            uint64_t *result_key, void *v, size_t value_size)
{
        int i, r;
        uint32_t flags, nr_entries;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                i = search_fn(ro_node(s), key);

                flags = le32_to_cpu(ro_node(s)->header.flags);
                nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (i < 0 || i >= nr_entries)
                        return -ENODATA;

                if (flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (!(flags & LEAF_NODE));

        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
        memcpy(v, value_ptr(ro_node(s), i), value_size);

        return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value_le)
{
        unsigned level, last_level = info->levels - 1;
        int r = -ENODATA;
        uint64_t rkey;
        __le64 internal_value_le;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                size_t size;
                void *value_p;

                if (level == last_level) {
                        value_p = value_le;
                        size = info->value_type.size;

                } else {
                        value_p = &internal_value_le;
                        size = sizeof(uint64_t);
                }

                r = btree_lookup_raw(&spine, root, keys[level],
                                     lower_bound, &rkey,
                                     value_p, size);

                if (!r) {
                        if (rkey != keys[level]) {
                                exit_ro_spine(&spine);
                                return -ENODATA;
                        }
                } else {
                        exit_ro_spine(&spine);
                        return r;
                }

                root = le64_to_cpu(internal_value_le);
        }
        exit_ro_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
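
/*
 * Hedged usage sketch (illustrative only): keys[] holds one key per tree
 * level (so a single key when levels == 1) and the value is returned in
 * little-endian form; -ENODATA means the key is not present.
 *
 *      uint64_t key = 1234;
 *      __le64 value_le;
 *
 *      r = dm_btree_lookup(&info, root, &key, &value_le);
 *      if (!r)
 *              do_something_with(le64_to_cpu(value_le));
 */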

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
                                       uint64_t key, uint64_t *rkey, void *value_le)
{
        int r, i;
        uint32_t flags, nr_entries;
        struct dm_block *node;
        struct btree_node *n;

        r = bn_read_lock(info, root, &node);
        if (r)
                return r;

        n = dm_block_data(node);
        flags = le32_to_cpu(n->header.flags);
        nr_entries = le32_to_cpu(n->header.nr_entries);

        if (flags & INTERNAL_NODE) {
                i = lower_bound(n, key);
                if (i < 0 || i >= nr_entries) {
                        r = -ENODATA;
                        goto out;
                }

                r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
                if (r == -ENODATA && i < (nr_entries - 1)) {
                        i++;
                        r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
                }

        } else {
                i = upper_bound(n, key);
                if (i < 0 || i >= nr_entries) {
                        r = -ENODATA;
                        goto out;
                }

                *rkey = le64_to_cpu(n->keys[i]);
                memcpy(value_le, value_ptr(n, i), info->value_type.size);
        }
out:
        dm_tm_unlock(info->tm, node);
        return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
                         uint64_t *keys, uint64_t *rkey, void *value_le)
{
        unsigned level;
        int r = -ENODATA;
        __le64 internal_value_le;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels - 1u; level++) {
                r = btree_lookup_raw(&spine, root, keys[level],
                                     lower_bound, rkey,
                                     &internal_value_le, sizeof(uint64_t));
                if (r)
                        goto out;

                if (*rkey != keys[level]) {
                        r = -ENODATA;
                        goto out;
                }

                root = le64_to_cpu(internal_value_le);
        }

        r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
        exit_ro_spine(&spine);
        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *            |
 *            v
 *       +----------+
 *       | A ++++++ |
 *       +----------+
 *
 * After:
 *        +--------+
 *        | Parent |
 *        +--------+
 *          |    |
 *          v    +------+
 *    +---------+       |
 *    | A* +++  |       v
 *    +---------+   +-------+
 *                  | B +++ |
 *                  +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
                               uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *parent;
        struct btree_node *ln, *rn, *pn;
        __le64 location;

        left = shadow_current(s);

        r = new_block(s->info, &right);
        if (r < 0)
                return r;

        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
        nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

        ln->header.nr_entries = cpu_to_le32(nr_left);

        rn->header.flags = ln->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = ln->header.max_entries;
        rn->header.value_size = ln->header.value_size;
        memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

        size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
                sizeof(uint64_t) : s->info->value_type.size;
        memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
               size * nr_right);

        /*
         * Patch up the parent
         */
        parent = shadow_parent(s);

        pn = dm_block_data(parent);
        location = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&location);
        memcpy_disk(value_ptr(pn, parent_index),
                    &location, sizeof(__le64));

        location = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&location);

        r = insert_at(sizeof(__le64), pn, parent_index + 1,
                      le64_to_cpu(rn->keys[0]), &location);
        if (r) {
                unlock_block(s->info, right);
                return r;
        }

        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }

        return 0;
}
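
/*
 * Illustrative split arithmetic (assuming a full node of 252 entries, as in
 * the calc_max_entries example above): nr_left = 252 / 2 = 126 entries stay
 * in the shadowed node A*, nr_right = 252 - 126 = 126 move to the new
 * sibling B, and B's first key (rn->keys[0]) becomes the separator inserted
 * into the parent at parent_index + 1.
 */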

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *       +----------+
 *       | A ++++++ |
 *       +----------+
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |     |
 *     +----+     +----+
 *     |               |
 *     v               v
 * +-------+       +-------+
 * | B +++ |       | C +++ |
 * +-------+       +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *new_parent;
        struct btree_node *pn, *ln, *rn;
        __le64 val;

        new_parent = shadow_current(s);

        r = new_block(s->info, &left);
        if (r < 0)
                return r;

        r = new_block(s->info, &right);
        if (r < 0) {
                unlock_block(s->info, left);
                return r;
        }

        pn = dm_block_data(new_parent);
        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
        nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

        ln->header.flags = pn->header.flags;
        ln->header.nr_entries = cpu_to_le32(nr_left);
        ln->header.max_entries = pn->header.max_entries;
        ln->header.value_size = pn->header.value_size;

        rn->header.flags = pn->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = pn->header.max_entries;
        rn->header.value_size = pn->header.value_size;

        memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
        memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

        size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
                sizeof(__le64) : s->info->value_type.size;
        memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
        memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
               nr_right * size);

        /* new_parent should just point to l and r now */
        pn->header.flags = cpu_to_le32(INTERNAL_NODE);
        pn->header.nr_entries = cpu_to_le32(2);
        pn->header.max_entries = cpu_to_le32(
                calc_max_entries(sizeof(__le64),
                                 dm_bm_block_size(
                                         dm_tm_get_bm(s->info->tm))));
        pn->header.value_size = cpu_to_le32(sizeof(__le64));

        val = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&val);
        pn->keys[0] = ln->keys[0];
        memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

        val = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&val);
        pn->keys[1] = rn->keys[0];
        memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

        unlock_block(s->info, left);
        unlock_block(s->info, right);
        return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
                            struct dm_btree_value_type *vt,
                            uint64_t key, unsigned *index)
{
        int r, i = *index, top = 1;
        struct btree_node *node;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        return r;

                node = dm_block_data(shadow_current(s));

                /*
                 * We have to patch up the parent node here; it's ugly, but I
                 * don't see a way to do this automatically as part of the
                 * spine op.
                 */
                if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

                        __dm_bless_for_disk(&location);
                        memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
                                    &location, sizeof(__le64));
                }

                node = dm_block_data(shadow_current(s));

                if (node->header.nr_entries == node->header.max_entries) {
                        if (top)
                                r = btree_split_beneath(s, key);
                        else
                                r = btree_split_sibling(s, i, key);

                        if (r < 0)
                                return r;
                }

                node = dm_block_data(shadow_current(s));

                i = lower_bound(node, key);

                if (le32_to_cpu(node->header.flags) & LEAF_NODE)
                        break;

                if (i < 0) {
                        /* change the bounds on the lowest key */
                        node->keys[0] = cpu_to_le64(key);
                        i = 0;
                }

                root = value64(node, i);
                top = 0;
        }

        if (i < 0 || le64_to_cpu(node->keys[i]) != key)
                i++;

        *index = i;
        return 0;
}

static int insert(struct dm_btree_info *info, dm_block_t root,
                  uint64_t *keys, void *value, dm_block_t *new_root,
                  int *inserted)
        __dm_written_to_disk(value)
{
        int r, need_insert;
        unsigned level, index = -1, last_level = info->levels - 1;
        dm_block_t block = root;
        struct shadow_spine spine;
        struct btree_node *n;
        struct dm_btree_value_type le64_type;

        init_le64_type(info->tm, &le64_type);
        init_shadow_spine(&spine, info);

        for (level = 0; level < (info->levels - 1); level++) {
                r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
                if (r < 0)
                        goto bad;

                n = dm_block_data(shadow_current(&spine));
                need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                               (le64_to_cpu(n->keys[index]) != keys[level]));

                if (need_insert) {
                        dm_block_t new_tree;
                        __le64 new_le;

                        r = dm_btree_empty(info, &new_tree);
                        if (r < 0)
                                goto bad;

                        new_le = cpu_to_le64(new_tree);
                        __dm_bless_for_disk(&new_le);

                        r = insert_at(sizeof(uint64_t), n, index,
                                      keys[level], &new_le);
                        if (r)
                                goto bad;
                }

                if (level < last_level)
                        block = value64(n, index);
        }

        r = btree_insert_raw(&spine, block, &info->value_type,
                             keys[level], &index);
        if (r < 0)
                goto bad;

        n = dm_block_data(shadow_current(&spine));
        need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                       (le64_to_cpu(n->keys[index]) != keys[level]));

        if (need_insert) {
                if (inserted)
                        *inserted = 1;

                r = insert_at(info->value_type.size, n, index,
                              keys[level], value);
                if (r)
                        goto bad_unblessed;
        } else {
                if (inserted)
                        *inserted = 0;

                if (info->value_type.dec &&
                    (!info->value_type.equal ||
                     !info->value_type.equal(
                             info->value_type.context,
                             value_ptr(n, index),
                             value))) {
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index));
                }
                memcpy_disk(value_ptr(n, index),
                            value, info->value_type.size);
        }

        *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return 0;

bad:
        __dm_unbless_for_disk(value);
bad_unblessed:
        exit_shadow_spine(&spine);
        return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value, dm_block_t *new_root)
        __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
                           uint64_t *keys, void *value, dm_block_t *new_root,
                           int *inserted)
        __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
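
/*
 * Hedged usage sketch (illustrative only): insertion is copy-on-write, so
 * the caller adopts the returned new_root; the value is blessed for disk to
 * match the __dm_written_to_disk(value) annotation above.
 *
 *      uint64_t key = 1234;
 *      __le64 value_le = cpu_to_le64(5678);
 *      dm_block_t new_root;
 *      int inserted;
 *
 *      __dm_bless_for_disk(&value_le);
 *      r = dm_btree_insert_notify(&info, root, &key, &value_le,
 *                                 &new_root, &inserted);
 *      if (!r)
 *              root = new_root;
 */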

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
                    uint64_t *result_key, dm_block_t *next_block)
{
        int i, r;
        uint32_t flags;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                flags = le32_to_cpu(ro_node(s)->header.flags);
                i = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (!i)
                        return -ENODATA;
                else
                        i--;

                if (find_highest)
                        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
                else
                        *result_key = le64_to_cpu(ro_node(s)->keys[0]);

                if (next_block || flags & INTERNAL_NODE) {
                        if (find_highest)
                                block = value64(ro_node(s), i);
                        else
                                block = value64(ro_node(s), 0);
                }

        } while (flags & INTERNAL_NODE);

        if (next_block)
                *next_block = block;
        return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
                             bool find_highest, uint64_t *result_keys)
{
        int r = 0, count = 0, level;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                r = find_key(&spine, root, find_highest, result_keys + level,
                             level == info->levels - 1 ? NULL : &root);
                if (r == -ENODATA) {
                        r = 0;
                        break;

                } else if (r)
                        break;

                count++;
        }
        exit_ro_spine(&spine);

        return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
                              uint64_t *result_keys)
{
        return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
                             uint64_t *result_keys)
{
        return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
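
/*
 * Hedged usage note (illustrative only): result_keys must have room for one
 * key per tree level; the return value is the number of levels for which a
 * key was found (so 0 for an empty tree), or a negative errno.
 *
 *      uint64_t highest[1];
 *
 *      r = dm_btree_find_highest_key(&info, root, highest);
 *      if (r == 1)
 *              next_unused_key = highest[0] + 1;
 */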

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
                     int (*fn)(void *context, uint64_t *keys, void *leaf),
                     void *context)
{
        int r;
        unsigned i, nr;
        struct dm_block *node;
        struct btree_node *n;
        uint64_t keys;

        r = bn_read_lock(info, block, &node);
        if (r)
                return r;

        n = dm_block_data(node);

        nr = le32_to_cpu(n->header.nr_entries);
        for (i = 0; i < nr; i++) {
                if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
                        r = walk_node(info, value64(n, i), fn, context);
                        if (r)
                                goto out;
                } else {
                        keys = le64_to_cpu(*key_ptr(n, i));
                        r = fn(context, &keys, value_ptr(n, i));
                        if (r)
                                goto out;
                }
        }

out:
        dm_tm_unlock(info->tm, node);
        return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
                  int (*fn)(void *context, uint64_t *keys, void *leaf),
                  void *context)
{
        BUG_ON(info->levels > 1);
        return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
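
/*
 * Hedged usage sketch (illustrative only; single-level trees, per the
 * BUG_ON above): the callback is invoked once per leaf entry and walking
 * stops on the first non-zero return.  The helper name is hypothetical.
 *
 *      static int count_entry(void *context, uint64_t *keys, void *leaf)
 *      {
 *              (*(unsigned *) context)++;
 *              return 0;
 *      }
 *
 *      unsigned count = 0;
 *
 *      r = dm_btree_walk(&info, root, count_entry, &count);
 */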