bset.h

#ifndef _BCACHE_BSET_H
#define _BCACHE_BSET_H

#include <linux/bcache.h>
#include <linux/kernel.h>
#include <linux/types.h>

#include "util.h" /* for time_stats */

/*
 * BKEYS:
 *
 * A bkey contains a key, a size field, a variable number of pointers, and some
 * ancillary flag bits.
 *
 * We use two different functions for validating bkeys, bch_ptr_invalid() and
 * bch_ptr_bad().
 *
 * bch_ptr_invalid() primarily filters out keys and pointers that would be
 * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and
 * pointers that occur in normal practice but don't point to real data.
 *
 * The one exception to the rule that ptr_invalid() filters out invalid keys is
 * that it also filters out keys of size 0 - these are keys that have been
 * completely overwritten. It'd be safe to delete these in memory while leaving
 * them on disk, just unnecessary work - so we filter them out when resorting
 * instead.
 *
 * We can't filter out stale keys when we're resorting, because garbage
 * collection needs to find them to ensure bucket gens don't wrap around -
 * unless we're rewriting the btree node, those stale keys still exist on disk.
 *
 * We also implement functions here for removing some number of sectors from the
 * front or the back of a bkey - this is mainly used for fixing overlapping
 * extents, by removing the overlapping sectors from the older key.
 *
 * BSETS:
 *
 * A bset is an array of bkeys laid out contiguously in memory in sorted order,
 * along with a header. A btree node is made up of a number of these, written at
 * different times.
 *
 * There could be many of them on disk, but we never allow there to be more than
 * 4 in memory - we lazily resort as needed.
 *
 * We implement code here for creating and maintaining auxiliary search trees
 * (described below) for searching an individual bset, and on top of that we
 * implement a btree iterator.
 *
 * BTREE ITERATOR:
 *
 * Most of the code in bcache doesn't care about an individual bset - it needs
 * to search entire btree nodes and iterate over them in sorted order.
 *
 * The btree iterator code serves both functions; it iterates through the keys
 * in a btree node in sorted order, starting from either keys after a specific
 * point (if you pass it a search key) or the start of the btree node.
 *
  57. *
  58. * Since keys are variable length, we can't use a binary search on a bset - we
  59. * wouldn't be able to find the start of the next key. But binary searches are
  60. * slow anyways, due to terrible cache behaviour; bcache originally used binary
  61. * searches and that code topped out at under 50k lookups/second.
  62. *
  63. * So we need to construct some sort of lookup table. Since we only insert keys
  64. * into the last (unwritten) set, most of the keys within a given btree node are
  65. * usually in sets that are mostly constant. We use two different types of
  66. * lookup tables to take advantage of this.
  67. *
  68. * Both lookup tables share in common that they don't index every key in the
  69. * set; they index one key every BSET_CACHELINE bytes, and then a linear search
  70. * is used for the rest.
  71. *
  72. * For sets that have been written to disk and are no longer being inserted
  73. * into, we construct a binary search tree in an array - traversing a binary
  74. * search tree in an array gives excellent locality of reference and is very
  75. * fast, since both children of any node are adjacent to each other in memory
  76. * (and their grandchildren, and great grandchildren...) - this means
  77. * prefetching can be used to great effect.
  78. *
  79. * It's quite useful performance wise to keep these nodes small - not just
  80. * because they're more likely to be in L2, but also because we can prefetch
  81. * more nodes on a single cacheline and thus prefetch more iterations in advance
  82. * when traversing this tree.
  83. *
  84. * Nodes in the auxiliary search tree must contain both a key to compare against
  85. * (we don't want to fetch the key from the set, that would defeat the purpose),
  86. * and a pointer to the key. We use a few tricks to compress both of these.
  87. *
  88. * To compress the pointer, we take advantage of the fact that one node in the
  89. * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
  90. * a function (to_inorder()) that takes the index of a node in a binary tree and
  91. * returns what its index would be in an inorder traversal, so we only have to
  92. * store the low bits of the offset.
  93. *
  94. * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
  95. * compress that, we take advantage of the fact that when we're traversing the
  96. * search tree at every iteration we know that both our search key and the key
  97. * we're looking for lie within some range - bounded by our previous
  98. * comparisons. (We special case the start of a search so that this is true even
  99. * at the root of the tree).
  100. *
 * So if we know the key we're looking for is between a and b, and a and b
 * don't differ above bit 50, we don't need to check anything higher than bit
 * 50.
 *
 * We don't usually need the rest of the bits, either; we only need enough bits
 * to partition the key range we're currently checking. Consider key n - the
 * key our auxiliary search tree node corresponds to, and key p, the key
 * immediately preceding n. The lowest bit we need to store in the auxiliary
 * search tree is the highest bit that differs between n and p.
 *
 * Note that this could be bit 0 - we might sometimes need all 84 bits to do the
 * comparison. But we'd really like our nodes in the auxiliary search tree to be
 * of fixed size.
 *
 * The solution is to make them fixed size, and when we're constructing a node
 * check if p and n differed in the bits we needed them to. If they don't we
 * flag that node, and when doing lookups we fall back to comparing against the
 * real key. As long as this doesn't happen too often (and it seems to reliably
 * happen a bit less than 1% of the time), we win - even on failures, that key
 * is then more likely to be in cache than if we were doing binary searches all
 * the way, since we're touching so much less memory.
 *
 * The keys in the auxiliary search tree are stored in (software) floating
 * point, with an exponent and a mantissa. The exponent needs to be big enough
 * to address all the bits in the original key, but the number of bits in the
 * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
 *
 * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
 * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
 * We need one node per 128 bytes in the btree node, which means the auxiliary
 * search trees take up 3% as much memory as the btree itself.
 *
 * Constructing these auxiliary search trees is moderately expensive, and we
 * don't want to be constantly rebuilding the search tree for the last set
 * whenever we insert another key into it. For the unwritten set, we use a much
 * simpler lookup table - it's just a flat array, so index i in the lookup table
 * corresponds to the ith range of BSET_CACHELINE bytes in the set. Indexing
 * within each byte range works the same as with the auxiliary search trees.
 *
 * These are much easier to keep up to date when we insert a key - we do it
 * somewhat lazily; when we shift a key up we usually just increment the pointer
 * to it, and only when it would overflow do we go to the trouble of finding the
 * first key in that range of bytes again.
 */

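/*
 * Illustrative sketch (hypothetical helper, not the implementation in bset.c):
 * the pointer compression described above relies on mapping a node's index in
 * the breadth-first array (rooted at 1) to its position in an inorder
 * traversal. For a perfect binary tree with 'depth' levels that mapping is
 * plain bit arithmetic; the real to_inorder() additionally has to handle a
 * partially filled bottom level, using the precomputed 'extra' field below.
 */
static inline unsigned example_to_inorder(unsigned j, unsigned depth)
{
	unsigned level = 0, i;

	/* level of node j: 1 for the root, 2 for its children, and so on */
	for (i = j; i; i >>= 1)
		level++;

	/* nodes on a given level are spread evenly across inorder positions */
	return (2 * (j - (1U << (level - 1))) + 1) << (depth - level);
}
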
struct btree_keys;
struct btree_iter;
struct btree_iter_set;
struct bkey_float;

#define MAX_BSETS		4U

struct bset_tree {
	/*
	 * We construct a binary tree in an array as if the array
	 * started at 1, so that things line up on the same cachelines
	 * better: see comments in bset.c at cacheline_to_bkey() for
	 * details
	 */

	/* size of the binary tree and prev array */
	unsigned		size;

	/* function of size - precalculated for to_inorder() */
	unsigned		extra;

	/* copy of the last key in the set */
	struct bkey		end;
	struct bkey_float	*tree;

	/*
	 * The nodes in the bset tree point to specific keys - this
	 * array holds the sizes of the previous key.
	 *
	 * Conceptually it's a member of struct bkey_float, but we want
	 * to keep bkey_float to 4 bytes and prev isn't used in the fast
	 * path.
	 */
	uint8_t			*prev;

	/* The actual btree node, with pointers to each sorted set */
	struct bset		*data;
};

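/*
 * Sketch of what a node in the auxiliary search tree roughly looks like, per
 * the arithmetic in the comment at the top of this file (7 exponent bits,
 * 3 bits of key offset within a cacheline, 22 mantissa bits = 4 bytes). The
 * real struct bkey_float that t->tree points at is defined in bset.c; this
 * copy exists only to illustrate the layout and is not used anywhere.
 */
struct example_bkey_float {
	unsigned	exponent:7;	/* bit position in the key where the mantissa starts */
	unsigned	m:3;		/* low bits of the key's offset within its cacheline */
	unsigned	mantissa:22;	/* the next bits of the key below 'exponent' */
};
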
struct btree_keys_ops {
	bool		(*sort_cmp)(struct btree_iter_set,
				    struct btree_iter_set);
	struct bkey	*(*sort_fixup)(struct btree_iter *, struct bkey *);
	bool		(*insert_fixup)(struct btree_keys *, struct bkey *,
					struct btree_iter *, struct bkey *);
	bool		(*key_invalid)(struct btree_keys *,
				       const struct bkey *);
	bool		(*key_bad)(struct btree_keys *, const struct bkey *);
	bool		(*key_merge)(struct btree_keys *,
				     struct bkey *, struct bkey *);
	void		(*key_to_text)(char *, size_t, const struct bkey *);
	void		(*key_dump)(struct btree_keys *, const struct bkey *);

	/*
	 * Only used for deciding whether to use START_KEY(k) or just the key
	 * itself in a couple places
	 */
	bool		is_extents;
};

struct btree_keys {
	const struct btree_keys_ops	*ops;
	uint8_t			page_order;
	uint8_t			nsets;
	unsigned		last_set_unwritten:1;
	bool			*expensive_debug_checks;

	/*
	 * Sets of sorted keys - the real btree node - plus a binary search tree
	 *
	 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
	 * to the memory we have allocated for this btree node. Additionally,
	 * set[0]->data points to the entire btree node as it exists on disk.
	 */
	struct bset_tree	set[MAX_BSETS];
};

static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
{
	return b->set + b->nsets;
}

static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
{
	return t <= b->set + b->nsets - b->last_set_unwritten;
}

static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
{
	return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
}

static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
{
	return ((size_t) i) - ((size_t) b->set->data);
}

static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
{
	return bset_byte_offset(b, i) >> 9;
}

#define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t))
#define set_bytes(i)		__set_bytes(i, i->keys)

#define __set_blocks(i, k, block_bytes)					\
	DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
#define set_blocks(i, block_bytes)					\
	__set_blocks(i, (i)->keys, block_bytes)

static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
{
	struct bset_tree *t = bset_tree_last(b);

	BUG_ON((PAGE_SIZE << b->page_order) <
	       (bset_byte_offset(b, t->data) + set_bytes(t->data)));

	if (!b->last_set_unwritten)
		return 0;

	return ((PAGE_SIZE << b->page_order) -
		(bset_byte_offset(b, t->data) + set_bytes(t->data))) /
		sizeof(u64);
}

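/*
 * Usage sketch (hypothetical helper, not part of the bcache API): a caller
 * that wants to append a key to the unwritten set would typically check that
 * it fits in the remaining space first, roughly like this. bkey_u64s() comes
 * from linux/bcache.h.
 */
static inline bool example_key_fits(struct btree_keys *b, const struct bkey *k)
{
	return bch_btree_keys_u64s_remaining(b) >= bkey_u64s(k);
}
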
static inline struct bset *bset_next_set(struct btree_keys *b,
					 unsigned block_bytes)
{
	struct bset *i = bset_tree_last(b)->data;

	return ((void *) i) + roundup(set_bytes(i), block_bytes);
}

void bch_btree_keys_free(struct btree_keys *);
int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
			 bool *);

void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t);
void bch_bset_build_written_tree(struct btree_keys *);
void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
			      struct bkey *);

enum {
	BTREE_INSERT_STATUS_NO_INSERT = 0,
	BTREE_INSERT_STATUS_INSERT,
	BTREE_INSERT_STATUS_BACK_MERGE,
	BTREE_INSERT_STATUS_OVERWROTE,
	BTREE_INSERT_STATUS_FRONT_MERGE,
};

/* Btree key iteration */

struct btree_iter {
	size_t size, used;
#ifdef CONFIG_BCACHE_DEBUG
	struct btree_keys *b;
#endif
	struct btree_iter_set {
		struct bkey *k, *end;
	} data[MAX_BSETS];
};

typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *);

struct bkey *bch_btree_iter_next(struct btree_iter *);
struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
					struct btree_keys *, ptr_filter_fn);

void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *,
				 struct bkey *);

struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *,
			       const struct bkey *);

/*
 * Returns the first key that is strictly greater than search
 */
static inline struct bkey *bch_bset_search(struct btree_keys *b,
					   struct bset_tree *t,
					   const struct bkey *search)
{
	return search ? __bch_bset_search(b, t, search) : t->data->start;
}

#define for_each_key_filter(b, k, iter, filter)				\
	for (bch_btree_iter_init((b), (iter), NULL);			\
	     ((k) = bch_btree_iter_next_filter((iter), (b), filter));)

#define for_each_key(b, k, iter)					\
	for (bch_btree_iter_init((b), (iter), NULL);			\
	     ((k) = bch_btree_iter_next(iter));)

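/*
 * Usage sketch (hypothetical helper, not part of the bcache API): the iterator
 * macros above walk every key in a btree node in sorted order, merging the
 * node's sorted sets on the fly. Here that is used just to count the keys.
 */
static inline unsigned example_count_keys(struct btree_keys *b)
{
	struct btree_iter iter;
	struct bkey *k;
	unsigned ret = 0;

	for_each_key(b, k, &iter)
		ret++;

	return ret;
}
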
/* Sorting */

struct bset_sort_state {
	mempool_t		*pool;

	unsigned		page_order;
	unsigned		crit_factor;

	struct time_stats	time;
};

void bch_bset_sort_state_free(struct bset_sort_state *);
int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
			 struct bset_sort_state *);
void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
				    struct bset_sort_state *);
void bch_btree_sort_partial(struct btree_keys *, unsigned,
			    struct bset_sort_state *);

static inline void bch_btree_sort(struct btree_keys *b,
				  struct bset_sort_state *state)
{
	bch_btree_sort_partial(b, 0, state);
}

struct bset_stats {
	size_t sets_written, sets_unwritten;
	size_t bytes_written, bytes_unwritten;
	size_t floats, failed;
};

void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);

/* Bkey utility code */

#define bset_bkey_last(i)	bkey_idx((struct bkey *) (i)->d, (i)->keys)

static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
{
	return bkey_idx(i->start, idx);
}

static inline void bkey_init(struct bkey *k)
{
	*k = ZERO_KEY;
}

static __always_inline int64_t bkey_cmp(const struct bkey *l,
					const struct bkey *r)
{
	return unlikely(KEY_INODE(l) != KEY_INODE(r))
		? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r)
		: (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
}

void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
			      unsigned);
bool __bch_cut_front(const struct bkey *, struct bkey *);
bool __bch_cut_back(const struct bkey *, struct bkey *);

static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
{
	BUG_ON(bkey_cmp(where, k) > 0);
	return __bch_cut_front(where, k);
}

static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
{
	BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
	return __bch_cut_back(where, k);
}

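/*
 * Sketch (hypothetical helper): the comment at the top of this file mentions
 * fixing overlapping extents by removing the overlapping sectors from the
 * older key. If a newer extent's end falls inside an older extent, trimming
 * the older key's front at that point resolves the overlap - roughly:
 */
static inline void example_trim_overlap(struct bkey *older,
					const struct bkey *newer)
{
	/* newer ends strictly inside older's range (implies the same inode) */
	if (bkey_cmp(newer, &START_KEY(older)) > 0 &&
	    bkey_cmp(newer, older) < 0)
		bch_cut_front(newer, older);
}
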
#define PRECEDING_KEY(_k)						\
({									\
	struct bkey *_ret = NULL;					\
									\
	if (KEY_INODE(_k) || KEY_OFFSET(_k)) {				\
		_ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0);		\
									\
		if (!_ret->low)						\
			_ret->high--;					\
		_ret->low--;						\
	}								\
									\
	_ret;								\
})

static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
{
	return b->ops->key_invalid(b, k);
}

static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
{
	return b->ops->key_bad(b, k);
}

static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
				    size_t size, const struct bkey *k)
{
	return b->ops->key_to_text(buf, size, k);
}

static inline bool bch_bkey_equal_header(const struct bkey *l,
					 const struct bkey *r)
{
	return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
		KEY_PTRS(l) == KEY_PTRS(r) &&
		KEY_CSUM(l) == KEY_CSUM(r));
}

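/*
 * Sketch (hypothetical helper, not used by bcache itself): one plausible
 * precondition for merging two adjacent extents is that their header bits
 * match and the right key starts exactly where the left key ends;
 * bch_bkey_try_merge() is the entry point that actually drives a merge.
 */
static inline bool example_merge_candidate(const struct bkey *l,
					   const struct bkey *r)
{
	return bch_bkey_equal_header(l, r) &&
	       !bkey_cmp(l, &START_KEY(r));
}
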
/* Keylists */

struct keylist {
	union {
		struct bkey		*keys;
		uint64_t		*keys_p;
	};
	union {
		struct bkey		*top;
		uint64_t		*top_p;
	};

	/* Enough room for btree_split's keys without realloc */
#define KEYLIST_INLINE		16
	uint64_t		inline_keys[KEYLIST_INLINE];
};

static inline void bch_keylist_init(struct keylist *l)
{
	l->top_p = l->keys_p = l->inline_keys;
}

static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
{
	l->keys = k;
	l->top = bkey_next(k);
}

static inline void bch_keylist_push(struct keylist *l)
{
	l->top = bkey_next(l->top);
}

static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
{
	bkey_copy(l->top, k);
	bch_keylist_push(l);
}

static inline bool bch_keylist_empty(struct keylist *l)
{
	return l->top == l->keys;
}

static inline void bch_keylist_reset(struct keylist *l)
{
	l->top = l->keys;
}

static inline void bch_keylist_free(struct keylist *l)
{
	if (l->keys_p != l->inline_keys)
		kfree(l->keys_p);
}

static inline size_t bch_keylist_nkeys(struct keylist *l)
{
	return l->top_p - l->keys_p;
}

static inline size_t bch_keylist_bytes(struct keylist *l)
{
	return bch_keylist_nkeys(l) * sizeof(uint64_t);
}

struct bkey *bch_keylist_pop(struct keylist *);
void bch_keylist_pop_front(struct keylist *);
int __bch_keylist_realloc(struct keylist *, unsigned);

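/*
 * Usage sketch (hypothetical caller, not part of this header): accumulate a
 * key in a keylist and drain it in FIFO order. This assumes
 * __bch_keylist_realloc() takes the number of additional u64s needed and
 * returns 0 on success; bkey_u64s() and bkey_copy() come from linux/bcache.h.
 */
static inline int example_keylist_usage(struct bkey *k)
{
	struct keylist l;

	bch_keylist_init(&l);

	/* make room for k beyond the inline buffer if necessary */
	if (__bch_keylist_realloc(&l, bkey_u64s(k)))
		return -ENOMEM;

	bch_keylist_add(&l, k);

	while (!bch_keylist_empty(&l))
		bch_keylist_pop_front(&l);

	bch_keylist_free(&l);
	return 0;
}
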
/* Debug stuff */

#ifdef CONFIG_BCACHE_DEBUG

int __bch_count_data(struct btree_keys *);
void __bch_check_keys(struct btree_keys *, const char *, ...);
void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
void bch_dump_bucket(struct btree_keys *);

#else

static inline int __bch_count_data(struct btree_keys *b) { return -1; }
static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
static inline void bch_dump_bucket(struct btree_keys *b) {}
void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);

#endif

static inline bool btree_keys_expensive_checks(struct btree_keys *b)
{
#ifdef CONFIG_BCACHE_DEBUG
	return *b->expensive_debug_checks;
#else
	return false;
#endif
}

static inline int bch_count_data(struct btree_keys *b)
{
	return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
}

#define bch_check_keys(b, ...)						\
do {									\
	if (btree_keys_expensive_checks(b))				\
		__bch_check_keys(b, __VA_ARGS__);			\
} while (0)

#endif /* _BCACHE_BSET_H */