bt_split.c 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829
  1. /*-
  2. * Copyright (c) 1990, 1993, 1994
  3. * The Regents of the University of California. All rights reserved.
  4. *
  5. * This code is derived from software contributed to Berkeley by
  6. * Mike Olson.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. * 3. All advertising materials mentioning features or use of this software
  17. * must display the following acknowledgement:
  18. * This product includes software developed by the University of
  19. * California, Berkeley and its contributors.
  20. * 4. Neither the name of the University nor the names of its contributors
  21. * may be used to endorse or promote products derived from this software
  22. * without specific prior written permission.
  23. *
  24. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34. * SUCH DAMAGE.
  35. */
  36. #if defined(LIBC_SCCS) && !defined(lint)
  37. static char sccsid[] = "@(#)bt_split.c 8.9 (Berkeley) 7/26/94";
  38. #endif /* LIBC_SCCS and not lint */
  39. #include <sys/types.h>
  40. #include <limits.h>
  41. #include <stdio.h>
  42. #include <stdlib.h>
  43. #include <string.h>
  44. #include "../include/db.h"
  45. #include "btree.h"
  46. static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
  47. static PAGE *bt_page
  48. __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
  49. static int bt_preserve __P((BTREE *, pgno_t));
  50. static PAGE *bt_psplit
  51. __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t));
  52. static PAGE *bt_root
  53. __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
  54. static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
  55. static recno_t rec_total __P((PAGE *));
  56. #ifdef STATISTICS
  57. u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
  58. #endif
  59. /*
  60. * __BT_SPLIT -- Split the tree.
  61. *
  62. * Parameters:
  63. * t: tree
  64. * sp: page to split
  65. * key: key to insert
  66. * data: data to insert
  67. * flags: BIGKEY/BIGDATA flags
  68. * ilen: insert length
  69. * skip: index to leave open
  70. *
  71. * Returns:
  72. * RET_ERROR, RET_SUCCESS
  73. */
  74. int
  75. __bt_split(t, sp, key, data, flags, ilen, argskip)
  76. BTREE *t;
  77. PAGE *sp;
  78. const DBT *key, *data;
  79. int flags;
  80. size_t ilen;
  81. u_int32_t argskip;
  82. {
  83. BINTERNAL *bi = 0;
  84. BLEAF *bl = 0, *tbl;
  85. DBT a, b;
  86. EPGNO *parent;
  87. PAGE *h, *l, *r, *lchild, *rchild;
  88. indx_t nxtindex;
  89. u_int16_t skip;
  90. u_int32_t n, nbytes, nksize = 0;
  91. int parentsplit;
  92. char *dest;
  93. /*
  94. * Split the page into two pages, l and r. The split routines return
  95. * a pointer to the page into which the key should be inserted and with
  96. * skip set to the offset which should be used. Additionally, l and r
  97. * are pinned.
  98. */
  99. skip = argskip;
  100. h = sp->pgno == P_ROOT ?
  101. bt_root(t, sp, &l, &r, &skip, ilen) :
  102. bt_page(t, sp, &l, &r, &skip, ilen);
  103. if (h == NULL)
  104. return (RET_ERROR);
  105. /*
  106. * Insert the new key/data pair into the leaf page. (Key inserts
  107. * always cause a leaf page to split first.)
  108. */
  109. h->linp[skip] = h->upper -= ilen;
  110. dest = (char *)h + h->upper;
  111. if (F_ISSET(t, R_RECNO))
  112. WR_RLEAF(dest, data, flags)
  113. else
  114. WR_BLEAF(dest, key, data, flags)
  115. /* If the root page was split, make it look right. */
  116. if (sp->pgno == P_ROOT &&
  117. (F_ISSET(t, R_RECNO) ?
  118. bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
  119. goto err2;
  120. /*
  121. * Now we walk the parent page stack -- a LIFO stack of the pages that
  122. * were traversed when we searched for the page that split. Each stack
  123. * entry is a page number and a page index offset. The offset is for
  124. * the page traversed on the search. We've just split a page, so we
  125. * have to insert a new key into the parent page.
  126. *
  127. * If the insert into the parent page causes it to split, may have to
  128. * continue splitting all the way up the tree. We stop if the root
  129. * splits or the page inserted into didn't have to split to hold the
  130. * new key. Some algorithms replace the key for the old page as well
  131. * as the new page. We don't, as there's no reason to believe that the
  132. * first key on the old page is any better than the key we have, and,
  133. * in the case of a key being placed at index 0 causing the split, the
  134. * key is unavailable.
  135. *
  136. * There are a maximum of 5 pages pinned at any time. We keep the left
  137. * and right pages pinned while working on the parent. The 5 are the
  138. * two children, left parent and right parent (when the parent splits)
  139. * and the root page or the overflow key page when calling bt_preserve.
  140. * This code must make sure that all pins are released other than the
  141. * root page or overflow page which is unlocked elsewhere.
  142. */
  143. while ((parent = BT_POP(t)) != NULL) {
  144. lchild = l;
  145. rchild = r;
  146. /* Get the parent page. */
  147. if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
  148. goto err2;
  149. /*
  150. * The new key goes ONE AFTER the index, because the split
  151. * was to the right.
  152. */
  153. skip = parent->index + 1;
  154. /*
  155. * Calculate the space needed on the parent page.
  156. *
  157. * Prefix trees: space hack when inserting into BINTERNAL
  158. * pages. Retain only what's needed to distinguish between
  159. * the new entry and the LAST entry on the page to its left.
  160. * If the keys compare equal, retain the entire key. Note,
  161. * we don't touch overflow keys, and the entire key must be
  162. * retained for the next-to-left most key on the leftmost
  163. * page of each level, or the search will fail. Applicable
  164. * ONLY to internal pages that have leaf pages as children.
  165. * Further reduction of the key between pairs of internal
  166. * pages loses too much information.
  167. */
  168. switch (rchild->flags & P_TYPE) {
  169. case P_BINTERNAL:
  170. bi = GETBINTERNAL(rchild, 0);
  171. nbytes = NBINTERNAL(bi->ksize);
  172. break;
  173. case P_BLEAF:
  174. bl = GETBLEAF(rchild, 0);
  175. nbytes = NBINTERNAL(bl->ksize);
  176. if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
  177. (h->prevpg != P_INVALID || skip > 1)) {
  178. tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
  179. a.size = tbl->ksize;
  180. a.data = tbl->bytes;
  181. b.size = bl->ksize;
  182. b.data = bl->bytes;
  183. nksize = t->bt_pfx(&a, &b);
  184. n = NBINTERNAL(nksize);
  185. if (n < nbytes) {
  186. #ifdef STATISTICS
  187. bt_pfxsaved += nbytes - n;
  188. #endif
  189. nbytes = n;
  190. } else
  191. nksize = 0;
  192. } else
  193. nksize = 0;
  194. break;
  195. case P_RINTERNAL:
  196. case P_RLEAF:
  197. nbytes = NRINTERNAL;
  198. break;
  199. default:
  200. abort();
  201. }
  202. /* Split the parent page if necessary or shift the indices. */
  203. if ((u_int32_t) (h->upper - h->lower)
  204. < nbytes + sizeof(indx_t)) {
  205. sp = h;
  206. h = h->pgno == P_ROOT ?
  207. bt_root(t, h, &l, &r, &skip, nbytes) :
  208. bt_page(t, h, &l, &r, &skip, nbytes);
  209. if (h == NULL)
  210. goto err1;
  211. parentsplit = 1;
  212. } else {
  213. if (skip < (nxtindex = NEXTINDEX(h)))
  214. memmove(h->linp + skip + 1, h->linp + skip,
  215. (nxtindex - skip) * sizeof(indx_t));
  216. h->lower += sizeof(indx_t);
  217. parentsplit = 0;
  218. }
  219. /* Insert the key into the parent page. */
  220. switch (rchild->flags & P_TYPE) {
  221. case P_BINTERNAL:
  222. h->linp[skip] = h->upper -= nbytes;
  223. dest = (char *)h + h->linp[skip];
  224. memmove(dest, bi, nbytes);
  225. ((BINTERNAL *)dest)->pgno = rchild->pgno;
  226. break;
  227. case P_BLEAF:
  228. h->linp[skip] = h->upper -= nbytes;
  229. dest = (char *)h + h->linp[skip];
  230. WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
  231. rchild->pgno, bl->flags & P_BIGKEY);
  232. memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
  233. if (bl->flags & P_BIGKEY &&
  234. bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
  235. goto err1;
  236. break;
  237. case P_RINTERNAL:
  238. /*
  239. * Update the left page count. If split
  240. * added at index 0, fix the correct page.
  241. */
  242. if (skip > 0)
  243. dest = (char *)h + h->linp[skip - 1];
  244. else
  245. dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
  246. ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
  247. ((RINTERNAL *)dest)->pgno = lchild->pgno;
  248. /* Update the right page count. */
  249. h->linp[skip] = h->upper -= nbytes;
  250. dest = (char *)h + h->linp[skip];
  251. ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
  252. ((RINTERNAL *)dest)->pgno = rchild->pgno;
  253. break;
  254. case P_RLEAF:
  255. /*
  256. * Update the left page count. If split
  257. * added at index 0, fix the correct page.
  258. */
  259. if (skip > 0)
  260. dest = (char *)h + h->linp[skip - 1];
  261. else
  262. dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
  263. ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
  264. ((RINTERNAL *)dest)->pgno = lchild->pgno;
  265. /* Update the right page count. */
  266. h->linp[skip] = h->upper -= nbytes;
  267. dest = (char *)h + h->linp[skip];
  268. ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
  269. ((RINTERNAL *)dest)->pgno = rchild->pgno;
  270. break;
  271. default:
  272. abort();
  273. }
  274. /* Unpin the held pages. */
  275. if (!parentsplit) {
  276. mpool_put(t->bt_mp, h, MPOOL_DIRTY);
  277. break;
  278. }
  279. /* If the root page was split, make it look right. */
  280. if (sp->pgno == P_ROOT &&
  281. (F_ISSET(t, R_RECNO) ?
  282. bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
  283. goto err1;
  284. mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
  285. mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
  286. }
  287. /* Unpin the held pages. */
  288. mpool_put(t->bt_mp, l, MPOOL_DIRTY);
  289. mpool_put(t->bt_mp, r, MPOOL_DIRTY);
  290. /* Clear any pages left on the stack. */
  291. return (RET_SUCCESS);
  292. /*
  293. * If something fails in the above loop we were already walking back
  294. * up the tree and the tree is now inconsistent. Nothing much we can
  295. * do about it but release any memory we're holding.
  296. */
  297. err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
  298. mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
  299. err2: mpool_put(t->bt_mp, l, 0);
  300. mpool_put(t->bt_mp, r, 0);
  301. __dbpanic(t->bt_dbp);
  302. return (RET_ERROR);
  303. }
  304. /*
  305. * BT_PAGE -- Split a non-root page of a btree.
  306. *
  307. * Parameters:
  308. * t: tree
  309. * h: root page
  310. * lp: pointer to left page pointer
  311. * rp: pointer to right page pointer
  312. * skip: pointer to index to leave open
  313. * ilen: insert length
  314. *
  315. * Returns:
  316. * Pointer to page in which to insert or NULL on error.
  317. */
  318. static PAGE *
  319. bt_page(t, h, lp, rp, skip, ilen)
  320. BTREE *t;
  321. PAGE *h, **lp, **rp;
  322. indx_t *skip;
  323. size_t ilen;
  324. {
  325. PAGE *l, *r, *tp;
  326. pgno_t npg;
  327. #ifdef STATISTICS
  328. ++bt_split;
  329. #endif
  330. /* Put the new right page for the split into place. */
  331. if ((r = __bt_new(t, &npg)) == NULL)
  332. return (NULL);
  333. r->pgno = npg;
  334. r->lower = BTDATAOFF;
  335. r->upper = t->bt_psize;
  336. r->nextpg = h->nextpg;
  337. r->prevpg = h->pgno;
  338. r->flags = h->flags & P_TYPE;
  339. /*
  340. * If we're splitting the last page on a level because we're appending
  341. * a key to it (skip is NEXTINDEX()), it's likely that the data is
  342. * sorted. Adding an empty page on the side of the level is less work
  343. * and can push the fill factor much higher than normal. If we're
  344. * wrong it's no big deal, we'll just do the split the right way next
  345. * time. It may look like it's equally easy to do a similar hack for
  346. * reverse sorted data, that is, split the tree left, but it's not.
  347. * Don't even try.
  348. */
  349. if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
  350. #ifdef STATISTICS
  351. ++bt_sortsplit;
  352. #endif
  353. h->nextpg = r->pgno;
  354. r->lower = BTDATAOFF + sizeof(indx_t);
  355. *skip = 0;
  356. *lp = h;
  357. *rp = r;
  358. return (r);
  359. }
  360. /* Put the new left page for the split into place. */
  361. if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
  362. mpool_put(t->bt_mp, r, 0);
  363. return (NULL);
  364. }
  365. #ifdef PURIFY
  366. memset(l, 0xff, t->bt_psize);
  367. #endif
  368. l->pgno = h->pgno;
  369. l->nextpg = r->pgno;
  370. l->prevpg = h->prevpg;
  371. l->lower = BTDATAOFF;
  372. l->upper = t->bt_psize;
  373. l->flags = h->flags & P_TYPE;
  374. /* Fix up the previous pointer of the page after the split page. */
  375. if (h->nextpg != P_INVALID) {
  376. if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
  377. free(l);
  378. /* XXX mpool_free(t->bt_mp, r->pgno); */
  379. return (NULL);
  380. }
  381. tp->prevpg = r->pgno;
  382. mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
  383. }
  384. /*
  385. * Split right. The key/data pairs aren't sorted in the btree page so
  386. * it's simpler to copy the data from the split page onto two new pages
  387. * instead of copying half the data to the right page and compacting
  388. * the left page in place. Since the left page can't change, we have
  389. * to swap the original and the allocated left page after the split.
  390. */
  391. tp = bt_psplit(t, h, l, r, skip, ilen);
  392. /* Move the new left page onto the old left page. */
  393. memmove(h, l, t->bt_psize);
  394. if (tp == l)
  395. tp = h;
  396. free(l);
  397. *lp = h;
  398. *rp = r;
  399. return (tp);
  400. }
  401. /*
  402. * BT_ROOT -- Split the root page of a btree.
  403. *
  404. * Parameters:
  405. * t: tree
  406. * h: root page
  407. * lp: pointer to left page pointer
  408. * rp: pointer to right page pointer
  409. * skip: pointer to index to leave open
  410. * ilen: insert length
  411. *
  412. * Returns:
  413. * Pointer to page in which to insert or NULL on error.
  414. */
  415. static PAGE *
  416. bt_root(t, h, lp, rp, skip, ilen)
  417. BTREE *t;
  418. PAGE *h, **lp, **rp;
  419. indx_t *skip;
  420. size_t ilen;
  421. {
  422. PAGE *l, *r, *tp;
  423. pgno_t lnpg, rnpg;
  424. #ifdef STATISTICS
  425. ++bt_split;
  426. ++bt_rootsplit;
  427. #endif
  428. /* Put the new left and right pages for the split into place. */
  429. if ((l = __bt_new(t, &lnpg)) == NULL ||
  430. (r = __bt_new(t, &rnpg)) == NULL)
  431. return (NULL);
  432. l->pgno = lnpg;
  433. r->pgno = rnpg;
  434. l->nextpg = r->pgno;
  435. r->prevpg = l->pgno;
  436. l->prevpg = r->nextpg = P_INVALID;
  437. l->lower = r->lower = BTDATAOFF;
  438. l->upper = r->upper = t->bt_psize;
  439. l->flags = r->flags = h->flags & P_TYPE;
  440. /* Split the root page. */
  441. tp = bt_psplit(t, h, l, r, skip, ilen);
  442. *lp = l;
  443. *rp = r;
  444. return (tp);
  445. }
  446. /*
  447. * BT_RROOT -- Fix up the recno root page after it has been split.
  448. *
  449. * Parameters:
  450. * t: tree
  451. * h: root page
  452. * l: left page
  453. * r: right page
  454. *
  455. * Returns:
  456. * RET_ERROR, RET_SUCCESS
  457. */
  458. static int
  459. bt_rroot(t, h, l, r)
  460. BTREE *t;
  461. PAGE *h, *l, *r;
  462. {
  463. char *dest;
  464. /* Insert the left and right keys, set the header information. */
  465. h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
  466. dest = (char *)h + h->upper;
  467. WR_RINTERNAL(dest,
  468. l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
  469. h->linp[1] = h->upper -= NRINTERNAL;
  470. dest = (char *)h + h->upper;
  471. WR_RINTERNAL(dest,
  472. r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
  473. h->lower = BTDATAOFF + 2 * sizeof(indx_t);
  474. /* Unpin the root page, set to recno internal page. */
  475. h->flags &= ~P_TYPE;
  476. h->flags |= P_RINTERNAL;
  477. mpool_put(t->bt_mp, h, MPOOL_DIRTY);
  478. return (RET_SUCCESS);
  479. }
  480. /*
  481. * BT_BROOT -- Fix up the btree root page after it has been split.
  482. *
  483. * Parameters:
  484. * t: tree
  485. * h: root page
  486. * l: left page
  487. * r: right page
  488. *
  489. * Returns:
  490. * RET_ERROR, RET_SUCCESS
  491. */
  492. static int
  493. bt_broot(t, h, l, r)
  494. BTREE *t;
  495. PAGE *h, *l, *r;
  496. {
  497. BINTERNAL *bi;
  498. BLEAF *bl;
  499. u_int32_t nbytes;
  500. char *dest;
  501. /*
  502. * If the root page was a leaf page, change it into an internal page.
  503. * We copy the key we split on (but not the key's data, in the case of
  504. * a leaf page) to the new root page.
  505. *
  506. * The btree comparison code guarantees that the left-most key on any
  507. * level of the tree is never used, so it doesn't need to be filled in.
  508. */
  509. nbytes = NBINTERNAL(0);
  510. h->linp[0] = h->upper = t->bt_psize - nbytes;
  511. dest = (char *)h + h->upper;
  512. WR_BINTERNAL(dest, 0, l->pgno, 0);
  513. switch (h->flags & P_TYPE) {
  514. case P_BLEAF:
  515. bl = GETBLEAF(r, 0);
  516. nbytes = NBINTERNAL(bl->ksize);
  517. h->linp[1] = h->upper -= nbytes;
  518. dest = (char *)h + h->upper;
  519. WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
  520. memmove(dest, bl->bytes, bl->ksize);
  521. /*
  522. * If the key is on an overflow page, mark the overflow chain
  523. * so it isn't deleted when the leaf copy of the key is deleted.
  524. */
  525. if (bl->flags & P_BIGKEY &&
  526. bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
  527. return (RET_ERROR);
  528. break;
  529. case P_BINTERNAL:
  530. bi = GETBINTERNAL(r, 0);
  531. nbytes = NBINTERNAL(bi->ksize);
  532. h->linp[1] = h->upper -= nbytes;
  533. dest = (char *)h + h->upper;
  534. memmove(dest, bi, nbytes);
  535. ((BINTERNAL *)dest)->pgno = r->pgno;
  536. break;
  537. default:
  538. abort();
  539. }
  540. /* There are two keys on the page. */
  541. h->lower = BTDATAOFF + 2 * sizeof(indx_t);
  542. /* Unpin the root page, set to btree internal page. */
  543. h->flags &= ~P_TYPE;
  544. h->flags |= P_BINTERNAL;
  545. mpool_put(t->bt_mp, h, MPOOL_DIRTY);
  546. return (RET_SUCCESS);
  547. }
  548. /*
  549. * BT_PSPLIT -- Do the real work of splitting the page.
  550. *
  551. * Parameters:
  552. * t: tree
  553. * h: page to be split
  554. * l: page to put lower half of data
  555. * r: page to put upper half of data
  556. * pskip: pointer to index to leave open
  557. * ilen: insert length
  558. *
  559. * Returns:
  560. * Pointer to page in which to insert.
  561. */
  562. static PAGE *
  563. bt_psplit(t, h, l, r, pskip, ilen)
  564. BTREE *t;
  565. PAGE *h, *l, *r;
  566. indx_t *pskip;
  567. size_t ilen;
  568. {
  569. BINTERNAL *bi;
  570. BLEAF *bl;
  571. CURSOR *c;
  572. RLEAF *rl;
  573. PAGE *rval;
  574. void *src = 0;
  575. indx_t full, half, nxt, off, skip, top, used;
  576. u_int32_t nbytes;
  577. int bigkeycnt, isbigkey;
  578. /*
  579. * Split the data to the left and right pages. Leave the skip index
  580. * open. Additionally, make some effort not to split on an overflow
  581. * key. This makes internal page processing faster and can save
  582. * space as overflow keys used by internal pages are never deleted.
  583. */
  584. bigkeycnt = 0;
  585. skip = *pskip;
  586. full = t->bt_psize - BTDATAOFF;
  587. half = full / 2;
  588. used = 0;
  589. for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
  590. if (skip == off) {
  591. nbytes = ilen;
  592. isbigkey = 0; /* XXX: not really known. */
  593. } else
  594. switch (h->flags & P_TYPE) {
  595. case P_BINTERNAL:
  596. src = bi = GETBINTERNAL(h, nxt);
  597. nbytes = NBINTERNAL(bi->ksize);
  598. isbigkey = bi->flags & P_BIGKEY;
  599. break;
  600. case P_BLEAF:
  601. src = bl = GETBLEAF(h, nxt);
  602. nbytes = NBLEAF(bl);
  603. isbigkey = bl->flags & P_BIGKEY;
  604. break;
  605. case P_RINTERNAL:
  606. src = GETRINTERNAL(h, nxt);
  607. nbytes = NRINTERNAL;
  608. isbigkey = 0;
  609. break;
  610. case P_RLEAF:
  611. src = rl = GETRLEAF(h, nxt);
  612. nbytes = NRLEAF(rl);
  613. isbigkey = 0;
  614. break;
  615. default:
  616. abort();
  617. }
  618. /*
  619. * If the key/data pairs are substantial fractions of the max
  620. * possible size for the page, it's possible to get situations
  621. * where we decide to try and copy too much onto the left page.
  622. * Make sure that doesn't happen.
  623. */
  624. if ((skip <= off && used + nbytes + sizeof(indx_t) >= full)
  625. || nxt == top - 1) {
  626. --off;
  627. break;
  628. }
  629. /* Copy the key/data pair, if not the skipped index. */
  630. if (skip != off) {
  631. ++nxt;
  632. l->linp[off] = l->upper -= nbytes;
  633. memmove((char *)l + l->upper, src, nbytes);
  634. }
  635. used += nbytes + sizeof(indx_t);
  636. if (used >= half) {
  637. if (!isbigkey || bigkeycnt == 3)
  638. break;
  639. else
  640. ++bigkeycnt;
  641. }
  642. }
  643. /*
  644. * Off is the last offset that's valid for the left page.
  645. * Nxt is the first offset to be placed on the right page.
  646. */
  647. l->lower += (off + 1) * sizeof(indx_t);
  648. /*
  649. * If splitting the page that the cursor was on, the cursor has to be
  650. * adjusted to point to the same record as before the split. If the
  651. * cursor is at or past the skipped slot, the cursor is incremented by
  652. * one. If the cursor is on the right page, it is decremented by the
  653. * number of records split to the left page.
  654. */
  655. c = &t->bt_cursor;
  656. if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
  657. if (c->pg.index >= skip)
  658. ++c->pg.index;
  659. if (c->pg.index < nxt) /* Left page. */
  660. c->pg.pgno = l->pgno;
  661. else { /* Right page. */
  662. c->pg.pgno = r->pgno;
  663. c->pg.index -= nxt;
  664. }
  665. }
  666. /*
  667. * If the skipped index was on the left page, just return that page.
  668. * Otherwise, adjust the skip index to reflect the new position on
  669. * the right page.
  670. */
  671. if (skip <= off) {
  672. skip = 0;
  673. rval = l;
  674. } else {
  675. rval = r;
  676. *pskip -= nxt;
  677. }
  678. for (off = 0; nxt < top; ++off) {
  679. if (skip == nxt) {
  680. ++off;
  681. skip = 0;
  682. }
  683. switch (h->flags & P_TYPE) {
  684. case P_BINTERNAL:
  685. src = bi = GETBINTERNAL(h, nxt);
  686. nbytes = NBINTERNAL(bi->ksize);
  687. break;
  688. case P_BLEAF:
  689. src = bl = GETBLEAF(h, nxt);
  690. nbytes = NBLEAF(bl);
  691. break;
  692. case P_RINTERNAL:
  693. src = GETRINTERNAL(h, nxt);
  694. nbytes = NRINTERNAL;
  695. break;
  696. case P_RLEAF:
  697. src = rl = GETRLEAF(h, nxt);
  698. nbytes = NRLEAF(rl);
  699. break;
  700. default:
  701. abort();
  702. }
  703. ++nxt;
  704. r->linp[off] = r->upper -= nbytes;
  705. memmove((char *)r + r->upper, src, nbytes);
  706. }
  707. r->lower += off * sizeof(indx_t);
  708. /* If the key is being appended to the page, adjust the index. */
  709. if (skip == top)
  710. r->lower += sizeof(indx_t);
  711. return (rval);
  712. }
  713. /*
  714. * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
  715. *
  716. * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
  717. * record that references them gets deleted. Chains pointed to by internal
  718. * pages never get deleted. This routine marks a chain as pointed to by an
  719. * internal page.
  720. *
  721. * Parameters:
  722. * t: tree
  723. * pg: page number of first page in the chain.
  724. *
  725. * Returns:
  726. * RET_SUCCESS, RET_ERROR.
  727. */
  728. static int
  729. bt_preserve(t, pg)
  730. BTREE *t;
  731. pgno_t pg;
  732. {
  733. PAGE *h;
  734. if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
  735. return (RET_ERROR);
  736. h->flags |= P_PRESERVE;
  737. mpool_put(t->bt_mp, h, MPOOL_DIRTY);
  738. return (RET_SUCCESS);
  739. }
  740. /*
  741. * REC_TOTAL -- Return the number of recno entries below a page.
  742. *
  743. * Parameters:
  744. * h: page
  745. *
  746. * Returns:
  747. * The number of recno entries below a page.
  748. *
  749. * XXX
  750. * These values could be set by the bt_psplit routine. The problem is that the
  751. * entry has to be popped off of the stack etc. or the values have to be passed
  752. * all the way back to bt_split/bt_rroot and it's not very clean.
  753. */
  754. static recno_t
  755. rec_total(h)
  756. PAGE *h;
  757. {
  758. recno_t recs;
  759. indx_t nxt, top;
  760. for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
  761. recs += GETRINTERNAL(h, nxt)->nrecs;
  762. return (recs);
  763. }