in sqlite/sqlite3_debug.c [72335:73099]
static int balance_nonroot(
MemPage *pParent, /* Parent page of siblings being balanced */
int iParentIdx, /* Index of "the page" in pParent */
u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */
int isRoot, /* True if pParent is a root-page */
int bBulk /* True if this call is part of a bulk load */
){
BtShared *pBt; /* The whole database */
int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
int nNew = 0; /* Number of pages in apNew[] */
int nOld; /* Number of pages in apOld[] */
int i, j, k; /* Loop counters */
int nxDiv; /* Next divider slot in pParent->aCell[] */
int rc = SQLITE_OK; /* The return code */
u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */
int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
int usableSpace; /* Bytes in pPage beyond the header */
int pageFlags; /* Value of pPage->aData[0] */
int iSpace1 = 0; /* First unused byte of aSpace1[] */
int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */
int szScratch; /* Size of scratch memory requested */
MemPage *apOld[NB]; /* pPage and up to two siblings */
MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
u8 *pRight; /* Location in parent of right-sibling pointer */
u8 *apDiv[NB-1]; /* Divider cells in pParent */
int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */
int cntOld[NB+2]; /* Old index in b.apCell[] */
int szNew[NB+2]; /* Combined size of cells placed on i-th page */
u8 *aSpace1; /* Space for copies of dividers cells */
Pgno pgno; /* Temp var to store a page number in */
u8 abDone[NB+2]; /* True after i'th new page is populated */
Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */
Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */
u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */
CellArray b; /* Parsed information on cells being balanced */
memset(abDone, 0, sizeof(abDone));
b.nCell = 0;
b.apCell = 0;
pBt = pParent->pBt;
assert( sqlite3_mutex_held(pBt->mutex) );
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
/* At this point pParent may have at most one overflow cell. And if
** this overflow cell is present, it must be the cell with
** index iParentIdx. This scenario comes about when this function
** is called (indirectly) from sqlite3BtreeDelete().
*/
assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
if( !aOvflSpace ){
return SQLITE_NOMEM_BKPT;
}
assert( pParent->nFree>=0 );
/* Find the sibling pages to balance. Also locate the cells in pParent
** that divide the siblings. An attempt is made to find NN siblings on
** either side of pPage. More siblings are taken from one side, however,
** if there are fewer than NN siblings on the other side. If pParent
** has NB or fewer children then all children of pParent are taken.
**
** This loop also drops the divider cells from the parent page. This
** way, the remainder of the function does not have to deal with any
** overflow cells in the parent page, since if any existed they will
** have already been removed.
*/
i = pParent->nOverflow + pParent->nCell;
if( i<2 ){
nxDiv = 0;
}else{
assert( bBulk==0 || bBulk==1 );
if( iParentIdx==0 ){
nxDiv = 0;
}else if( iParentIdx==i ){
nxDiv = i-2+bBulk;
}else{
nxDiv = iParentIdx-1;
}
i = 2-bBulk;
}
nOld = i+1;
if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
pRight = &pParent->aData[pParent->hdrOffset+8];
}else{
pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
}
pgno = get4byte(pRight);
while( 1 ){
rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
if( rc ){
memset(apOld, 0, (i+1)*sizeof(MemPage*));
goto balance_cleanup;
}
if( apOld[i]->nFree<0 ){
rc = btreeComputeFreeSpace(apOld[i]);
if( rc ){
memset(apOld, 0, (i)*sizeof(MemPage*));
goto balance_cleanup;
}
}
if( (i--)==0 ) break;
if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
apDiv[i] = pParent->apOvfl[0];
pgno = get4byte(apDiv[i]);
szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
pParent->nOverflow = 0;
}else{
apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
pgno = get4byte(apDiv[i]);
szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
/* Drop the cell from the parent page. apDiv[i] still points to
** the cell within the parent, even though it has been dropped.
** This is safe because dropping a cell only overwrites the first
** four bytes of it, and this function does not need the first
** four bytes of the divider cell. So the pointer is safe to use
** later on.
**
** But not if we are in secure-delete mode. In secure-delete mode,
** the dropCell() routine will overwrite the entire cell with zeroes.
** In this case, temporarily copy the cell into the aOvflSpace[]
** buffer. It will be copied out again as soon as the aSpace[] buffer
** is allocated. */
if( pBt->btsFlags & BTS_FAST_SECURE ){
int iOff;
iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
if( (iOff+szNew[i])>(int)pBt->usableSize ){
rc = SQLITE_CORRUPT_BKPT;
memset(apOld, 0, (i+1)*sizeof(MemPage*));
goto balance_cleanup;
}else{
memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
}
}
dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
}
}
/* Make nMaxCells a multiple of 4 in order to preserve 8-byte
** alignment */
nMaxCells = nOld*(MX_CELL(pBt) + ArraySize(pParent->apOvfl));
nMaxCells = (nMaxCells + 3)&~3;
/*
** Allocate space for memory structures
*/
szScratch =
nMaxCells*sizeof(u8*) /* b.apCell */
+ nMaxCells*sizeof(u16) /* b.szCell */
+ pBt->pageSize; /* aSpace1 */
assert( szScratch<=7*(int)pBt->pageSize );
b.apCell = sqlite3StackAllocRaw(0, szScratch );
if( b.apCell==0 ){
rc = SQLITE_NOMEM_BKPT;
goto balance_cleanup;
}
b.szCell = (u16*)&b.apCell[nMaxCells];
aSpace1 = (u8*)&b.szCell[nMaxCells];
assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
/*
** Load pointers to all cells on sibling pages and the divider cells
** into the local b.apCell[] array. Make copies of the divider cells
** into space obtained from aSpace1[]. The divider cells have already
** been removed from pParent.
**
** If the siblings are on leaf pages, then the child pointers of the
** divider cells are stripped from the cells before they are copied
** into aSpace1[]. In this way, all cells in b.apCell[] are without
** child pointers. If siblings are not leaves, then all cell in
** b.apCell[] include child pointers. Either way, all cells in b.apCell[]
** are alike.
**
** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
** leafData: 1 if pPage holds key+data and pParent holds only keys.
*/
b.pRef = apOld[0];
leafCorrection = b.pRef->leaf*4;
leafData = b.pRef->intKeyLeaf;
for(i=0; i<nOld; i++){
MemPage *pOld = apOld[i];
int limit = pOld->nCell;
u8 *aData = pOld->aData;
u16 maskPage = pOld->maskPage;
u8 *piCell = aData + pOld->cellOffset;
u8 *piEnd;
VVA_ONLY( int nCellAtStart = b.nCell; )
/* Verify that all sibling pages are of the same "type" (table-leaf,
** table-interior, index-leaf, or index-interior).
*/
if( pOld->aData[0]!=apOld[0]->aData[0] ){
rc = SQLITE_CORRUPT_BKPT;
goto balance_cleanup;
}
/* Load b.apCell[] with pointers to all cells in pOld. If pOld
** contains overflow cells, include them in the b.apCell[] array
** in the correct spot.
**
** Note that when there are multiple overflow cells, it is always the
** case that they are sequential and adjacent. This invariant arises
** because multiple overflows can only occurs when inserting divider
** cells into a parent on a prior balance, and divider cells are always
** adjacent and are inserted in order. There is an assert() tagged
** with "NOTE 1" in the overflow cell insertion loop to prove this
** invariant.
**
** This must be done in advance. Once the balance starts, the cell
** offset section of the btree page will be overwritten and we will no
** long be able to find the cells if a pointer to each cell is not saved
** first.
*/
memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
if( pOld->nOverflow>0 ){
if( NEVER(limit<pOld->aiOvfl[0]) ){
rc = SQLITE_CORRUPT_BKPT;
goto balance_cleanup;
}
limit = pOld->aiOvfl[0];
for(j=0; j<limit; j++){
b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
piCell += 2;
b.nCell++;
}
for(k=0; k<pOld->nOverflow; k++){
assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
b.apCell[b.nCell] = pOld->apOvfl[k];
b.nCell++;
}
}
piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
while( piCell<piEnd ){
assert( b.nCell<nMaxCells );
b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
piCell += 2;
b.nCell++;
}
assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) );
cntOld[i] = b.nCell;
if( i<nOld-1 && !leafData){
u16 sz = (u16)szNew[i];
u8 *pTemp;
assert( b.nCell<nMaxCells );
b.szCell[b.nCell] = sz;
pTemp = &aSpace1[iSpace1];
iSpace1 += sz;
assert( sz<=pBt->maxLocal+23 );
assert( iSpace1 <= (int)pBt->pageSize );
memcpy(pTemp, apDiv[i], sz);
b.apCell[b.nCell] = pTemp+leafCorrection;
assert( leafCorrection==0 || leafCorrection==4 );
b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
if( !pOld->leaf ){
assert( leafCorrection==0 );
assert( pOld->hdrOffset==0 );
/* The right pointer of the child page pOld becomes the left
** pointer of the divider cell */
memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
}else{
assert( leafCorrection==4 );
while( b.szCell[b.nCell]<4 ){
/* Do not allow any cells smaller than 4 bytes. If a smaller cell
** does exist, pad it with 0x00 bytes. */
assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
aSpace1[iSpace1++] = 0x00;
b.szCell[b.nCell]++;
}
}
b.nCell++;
}
}
/*
** Figure out the number of pages needed to hold all b.nCell cells.
** Store this number in "k". Also compute szNew[] which is the total
** size of all cells on the i-th page and cntNew[] which is the index
** in b.apCell[] of the cell that divides page i from page i+1.
** cntNew[k] should equal b.nCell.
**
** Values computed by this block:
**
** k: The total number of sibling pages
** szNew[i]: Spaced used on the i-th sibling page.
** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
** the right of the i-th sibling page.
** usableSpace: Number of bytes of space available on each sibling.
**
*/
usableSpace = pBt->usableSize - 12 + leafCorrection;
for(i=k=0; i<nOld; i++, k++){
MemPage *p = apOld[i];
b.apEnd[k] = p->aDataEnd;
b.ixNx[k] = cntOld[i];
if( k && b.ixNx[k]==b.ixNx[k-1] ){
k--; /* Omit b.ixNx[] entry for child pages with no cells */
}
if( !leafData ){
k++;
b.apEnd[k] = pParent->aDataEnd;
b.ixNx[k] = cntOld[i]+1;
}
assert( p->nFree>=0 );
szNew[i] = usableSpace - p->nFree;
for(j=0; j<p->nOverflow; j++){
szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
}
cntNew[i] = cntOld[i];
}
k = nOld;
for(i=0; i<k; i++){
int sz;
while( szNew[i]>usableSpace ){
if( i+1>=k ){
k = i+2;
if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
szNew[k-1] = 0;
cntNew[k-1] = b.nCell;
}
sz = 2 + cachedCellSize(&b, cntNew[i]-1);
szNew[i] -= sz;
if( !leafData ){
if( cntNew[i]<b.nCell ){
sz = 2 + cachedCellSize(&b, cntNew[i]);
}else{
sz = 0;
}
}
szNew[i+1] += sz;
cntNew[i]--;
}
while( cntNew[i]<b.nCell ){
sz = 2 + cachedCellSize(&b, cntNew[i]);
if( szNew[i]+sz>usableSpace ) break;
szNew[i] += sz;
cntNew[i]++;
if( !leafData ){
if( cntNew[i]<b.nCell ){
sz = 2 + cachedCellSize(&b, cntNew[i]);
}else{
sz = 0;
}
}
szNew[i+1] -= sz;
}
if( cntNew[i]>=b.nCell ){
k = i+1;
}else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
rc = SQLITE_CORRUPT_BKPT;
goto balance_cleanup;
}
}
/*
** The packing computed by the previous block is biased toward the siblings
** on the left side (siblings with smaller keys). The left siblings are
** always nearly full, while the right-most sibling might be nearly empty.
** The next block of code attempts to adjust the packing of siblings to
** get a better balance.
**
** This adjustment is more than an optimization. The packing above might
** be so out of balance as to be illegal. For example, the right-most
** sibling might be completely empty. This adjustment is not optional.
*/
for(i=k-1; i>0; i--){
int szRight = szNew[i]; /* Size of sibling on the right */
int szLeft = szNew[i-1]; /* Size of sibling on the left */
int r; /* Index of right-most cell in left sibling */
int d; /* Index of first cell to the left of right sibling */
r = cntNew[i-1] - 1;
d = r + 1 - leafData;
(void)cachedCellSize(&b, d);
do{
assert( d<nMaxCells );
assert( r<nMaxCells );
(void)cachedCellSize(&b, r);
if( szRight!=0
&& (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
break;
}
szRight += b.szCell[d] + 2;
szLeft -= b.szCell[r] + 2;
cntNew[i-1] = r;
r--;
d--;
}while( r>=0 );
szNew[i] = szRight;
szNew[i-1] = szLeft;
if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
rc = SQLITE_CORRUPT_BKPT;
goto balance_cleanup;
}
}
/* Sanity check: For a non-corrupt database file one of the follwing
** must be true:
** (1) We found one or more cells (cntNew[0])>0), or
** (2) pPage is a virtual root page. A virtual root page is when
** the real root page is page 1 and we are the only child of
** that page.
*/
assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
apOld[0]->pgno, apOld[0]->nCell,
nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
));
/*
** Allocate k new pages. Reuse old pages where possible.
*/
pageFlags = apOld[0]->aData[0];
for(i=0; i<k; i++){
MemPage *pNew;
if( i<nOld ){
pNew = apNew[i] = apOld[i];
apOld[i] = 0;
rc = sqlite3PagerWrite(pNew->pDbPage);
nNew++;
if( rc ) goto balance_cleanup;
}else{
assert( i>0 );
rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
if( rc ) goto balance_cleanup;
zeroPage(pNew, pageFlags);
apNew[i] = pNew;
nNew++;
cntOld[i] = b.nCell;
/* Set the pointer-map entry for the new sibling page. */
if( ISAUTOVACUUM ){
ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
if( rc!=SQLITE_OK ){
goto balance_cleanup;
}
}
}
}
/*
** Reassign page numbers so that the new pages are in ascending order.
** This helps to keep entries in the disk file in order so that a scan
** of the table is closer to a linear scan through the file. That in turn
** helps the operating system to deliver pages from the disk more rapidly.
**
** An O(n^2) insertion sort algorithm is used, but since n is never more
** than (NB+2) (a small constant), that should not be a problem.
**
** When NB==3, this one optimization makes the database about 25% faster
** for large insertions and deletions.
*/
for(i=0; i<nNew; i++){
aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
aPgFlags[i] = apNew[i]->pDbPage->flags;
for(j=0; j<i; j++){
if( aPgno[j]==aPgno[i] ){
/* This branch is taken if the set of sibling pages somehow contains
** duplicate entries. This can happen if the database is corrupt.
** It would be simpler to detect this as part of the loop below, but
** we do the detection here in order to avoid populating the pager
** cache with two separate objects associated with the same
** page number. */
assert( CORRUPT_DB );
rc = SQLITE_CORRUPT_BKPT;
goto balance_cleanup;
}
}
}
for(i=0; i<nNew; i++){
int iBest = 0; /* aPgno[] index of page number to use */
for(j=1; j<nNew; j++){
if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
}
pgno = aPgOrder[iBest];
aPgOrder[iBest] = 0xffffffff;
if( iBest!=i ){
if( iBest>i ){
sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
}
sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
apNew[i]->pgno = pgno;
}
}
TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
"%d(%d nc=%d) %d(%d nc=%d)\n",
apNew[0]->pgno, szNew[0], cntNew[0],
nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
));
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
assert( nNew>=1 && nNew<=ArraySize(apNew) );
assert( apNew[nNew-1]!=0 );
put4byte(pRight, apNew[nNew-1]->pgno);
/* If the sibling pages are not leaves, ensure that the right-child pointer
** of the right-most new sibling page is set to the value that was
** originally in the same field of the right-most old sibling page. */
if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
}
/* Make any required updates to pointer map entries associated with
** cells stored on sibling pages following the balance operation. Pointer
** map entries associated with divider cells are set by the insertCell()
** routine. The associated pointer map entries are:
**
** a) if the cell contains a reference to an overflow chain, the
** entry associated with the first page in the overflow chain, and
**
** b) if the sibling pages are not leaves, the child page associated
** with the cell.
**
** If the sibling pages are not leaves, then the pointer map entry
** associated with the right-child of each sibling may also need to be
** updated. This happens below, after the sibling pages have been
** populated, not here.
*/
if( ISAUTOVACUUM ){
MemPage *pOld;
MemPage *pNew = pOld = apNew[0];
int cntOldNext = pNew->nCell + pNew->nOverflow;
int iNew = 0;
int iOld = 0;
for(i=0; i<b.nCell; i++){
u8 *pCell = b.apCell[i];
while( i==cntOldNext ){
iOld++;
assert( iOld<nNew || iOld<nOld );
assert( iOld>=0 && iOld<NB );
pOld = iOld<nNew ? apNew[iOld] : apOld[iOld];
cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
}
if( i==cntNew[iNew] ){
pNew = apNew[++iNew];
if( !leafData ) continue;
}
/* Cell pCell is destined for new sibling page pNew. Originally, it
** was either part of sibling page iOld (possibly an overflow cell),
** or else the divider cell to the left of sibling page iOld. So,
** if sibling page iOld had the same page number as pNew, and if
** pCell really was a part of sibling page iOld (not a divider or
** overflow cell), we can skip updating the pointer map entries. */
if( iOld>=nNew
|| pNew->pgno!=aPgno[iOld]
|| !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd)
){
if( !leafCorrection ){
ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
}
if( cachedCellSize(&b,i)>pNew->minLocal ){
ptrmapPutOvflPtr(pNew, pOld, pCell, &rc);
}
if( rc ) goto balance_cleanup;
}
}
}
/* Insert new divider cells into pParent. */
for(i=0; i<nNew-1; i++){
u8 *pCell;
u8 *pTemp;
int sz;
MemPage *pNew = apNew[i];
j = cntNew[i];
assert( j<nMaxCells );
assert( b.apCell[j]!=0 );
pCell = b.apCell[j];
sz = b.szCell[j] + leafCorrection;
pTemp = &aOvflSpace[iOvflSpace];
if( !pNew->leaf ){
memcpy(&pNew->aData[8], pCell, 4);
}else if( leafData ){
/* If the tree is a leaf-data tree, and the siblings are leaves,
** then there is no divider cell in b.apCell[]. Instead, the divider
** cell consists of the integer key for the right-most cell of
** the sibling-page assembled above only.
*/
CellInfo info;
j--;
pNew->xParseCell(pNew, b.apCell[j], &info);
pCell = pTemp;
sz = 4 + putVarint(&pCell[4], info.nKey);
pTemp = 0;
}else{
pCell -= 4;
/* Obscure case for non-leaf-data trees: If the cell at pCell was
** previously stored on a leaf node, and its reported size was 4
** bytes, then it may actually be smaller than this
** (see btreeParseCellPtr(), 4 bytes is the minimum size of
** any cell). But it is important to pass the correct size to
** insertCell(), so reparse the cell now.
**
** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
** and WITHOUT ROWID tables with exactly one column which is the
** primary key.
*/
if( b.szCell[j]==4 ){
assert(leafCorrection==4);
sz = pParent->xCellSize(pParent, pCell);
}
}
iOvflSpace += sz;
assert( sz<=pBt->maxLocal+23 );
assert( iOvflSpace <= (int)pBt->pageSize );
insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
if( rc!=SQLITE_OK ) goto balance_cleanup;
assert( sqlite3PagerIswriteable(pParent->pDbPage) );
}
/* Now update the actual sibling pages. The order in which they are updated
** is important, as this code needs to avoid disrupting any page from which
** cells may still to be read. In practice, this means:
**
** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
** then it is not safe to update page apNew[iPg] until after
** the left-hand sibling apNew[iPg-1] has been updated.
**
** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
** then it is not safe to update page apNew[iPg] until after
** the right-hand sibling apNew[iPg+1] has been updated.
**
** If neither of the above apply, the page is safe to update.
**
** The iPg value in the following loop starts at nNew-1 goes down
** to 0, then back up to nNew-1 again, thus making two passes over
** the pages. On the initial downward pass, only condition (1) above
** needs to be tested because (2) will always be true from the previous
** step. On the upward pass, both conditions are always true, so the
** upwards pass simply processes pages that were missed on the downward
** pass.
*/
for(i=1-nNew; i<nNew; i++){
int iPg = i<0 ? -i : i;
assert( iPg>=0 && iPg<nNew );
if( abDone[iPg] ) continue; /* Skip pages already processed */
if( i>=0 /* On the upwards pass, or... */
|| cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */
){
int iNew;
int iOld;
int nNewCell;
/* Verify condition (1): If cells are moving left, update iPg
** only after iPg-1 has already been updated. */
assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
/* Verify condition (2): If cells are moving right, update iPg
** only after iPg+1 has already been updated. */
assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
if( iPg==0 ){
iNew = iOld = 0;
nNewCell = cntNew[0];
}else{
iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
iNew = cntNew[iPg-1] + !leafData;
nNewCell = cntNew[iPg] - iNew;
}
rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
if( rc ) goto balance_cleanup;
abDone[iPg]++;
apNew[iPg]->nFree = usableSpace-szNew[iPg];
assert( apNew[iPg]->nOverflow==0 );
assert( apNew[iPg]->nCell==nNewCell );
}
}
/* All pages have been processed exactly once */
assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
assert( nOld>0 );
assert( nNew>0 );
if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
/* The root page of the b-tree now contains no cells. The only sibling
** page is the right-child of the parent. Copy the contents of the
** child page into the parent, decreasing the overall height of the
** b-tree structure by one. This is described as the "balance-shallower"
** sub-algorithm in some documentation.
**
** If this is an auto-vacuum database, the call to copyNodeContent()
** sets all pointer-map entries corresponding to database image pages
** for which the pointer is stored within the content being copied.
**
** It is critical that the child page be defragmented before being
** copied into the parent, because if the parent is page 1 then it will
** by smaller than the child due to the database header, and so all the
** free space needs to be up front.
*/
assert( nNew==1 || CORRUPT_DB );
rc = defragmentPage(apNew[0], -1);
testcase( rc!=SQLITE_OK );
assert( apNew[0]->nFree ==
(get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset
- apNew[0]->nCell*2)
|| rc!=SQLITE_OK
);
copyNodeContent(apNew[0], pParent, &rc);
freePage(apNew[0], &rc);
}else if( ISAUTOVACUUM && !leafCorrection ){
/* Fix the pointer map entries associated with the right-child of each
** sibling page. All other pointer map entries have already been taken
** care of. */
for(i=0; i<nNew; i++){
u32 key = get4byte(&apNew[i]->aData[8]);
ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
}
}
assert( pParent->isInit );
TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
nOld, nNew, b.nCell));
/* Free any old pages that were not reused as new pages.
*/
for(i=nNew; i<nOld; i++){
freePage(apOld[i], &rc);
}
#if 0
if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
/* The ptrmapCheckPages() contains assert() statements that verify that
** all pointer map pages are set correctly. This is helpful while
** debugging. This is usually disabled because a corrupt database may
** cause an assert() statement to fail. */
ptrmapCheckPages(apNew, nNew);
ptrmapCheckPages(&pParent, 1);
}
#endif
/*
** Cleanup before returning.
*/
balance_cleanup:
sqlite3StackFree(0, b.apCell);
for(i=0; i<nOld; i++){
releasePage(apOld[i]);
}
for(i=0; i<nNew; i++){
releasePage(apNew[i]);
}
return rc;
}