head	1.22;
access;
symbols
	REL9_0_0:1.22
	REL9_1_ALPHA1:1.22
	REL9_0_RC1:1.22
	REL9_0_BETA4:1.22
	REL9_0_STABLE:1.22.0.6
	REL9_0_BETA3:1.22
	REL9_0_BETA2:1.22
	REL7_4_29:1.7.4.1
	REL8_0_25:1.8.4.1
	REL8_1_21:1.8.6.1
	REL8_2_17:1.14
	REL8_3_11:1.18
	REL8_4_4:1.20
	REL9_0_BETA1:1.22
	REL9_0_ALPHA5_BRANCH:1.22.0.4
	REL9_0_ALPHA5:1.22
	REL7_4_28:1.7.4.1
	REL8_0_24:1.8.4.1
	REL8_1_20:1.8.6.1
	REL8_2_16:1.14
	REL8_3_10:1.18
	REL8_4_3:1.20
	REL9_0_ALPHA4:1.22
	REL9_0_ALPHA4_BRANCH:1.22.0.2
	REL8_5_ALPHA3:1.21
	REL8_5_ALPHA3_BRANCH:1.21.0.2
	REL7_4_27:1.7.4.1
	REL8_0_23:1.8.4.1
	REL8_1_19:1.8.6.1
	REL8_2_15:1.14
	REL8_3_9:1.18
	REL8_4_2:1.20
	REL8_5_ALPHA2:1.20
	REL8_5_ALPHA2_BRANCH:1.20.0.6
	REL7_4_26:1.7.4.1
	REL8_0_22:1.8.4.1
	REL8_1_18:1.8.6.1
	REL8_2_14:1.14
	REL8_3_8:1.18
	REL8_4_1:1.20
	REL8_5_ALPHA1:1.20
	REL8_5_ALPHA1_BRANCH:1.20.0.4
	REL8_4_STABLE:1.20.0.2
	REL8_4_0:1.20
	REL8_4_RC2:1.20
	REL8_4_RC1:1.20
	REL8_4_BETA2:1.20
	REL8_4_BETA1:1.20
	REL7_4_25:1.7.4.1
	REL8_0_21:1.8.4.1
	REL8_1_17:1.8.6.1
	REL8_2_13:1.14
	REL8_3_7:1.18
	REL7_4_24:1.7.4.1
	REL8_0_20:1.8.4.1
	REL8_1_16:1.8.6.1
	REL8_2_12:1.14
	REL8_3_6:1.18
	REL7_4_23:1.7.4.1
	REL8_0_19:1.8.4.1
	REL8_1_15:1.8.6.1
	REL8_2_11:1.14
	REL8_3_5:1.18
	REL7_4_22:1.7.4.1
	REL8_0_18:1.8.4.1
	REL8_1_14:1.8.6.1
	REL8_2_10:1.14
	REL8_3_4:1.18
	REL7_4_21:1.7.4.1
	REL8_0_17:1.8.4.1
	REL8_1_13:1.8.6.1
	REL8_2_9:1.14
	REL8_3_3:1.18
	REL7_4_20:1.7.4.1
	REL8_0_16:1.8.4.1
	REL8_1_12:1.8.6.1
	REL8_2_8:1.14
	REL8_3_2:1.18
	REL8_2_7:1.14
	REL8_3_1:1.18
	REL8_3_STABLE:1.18.0.2
	REL8_3_0:1.18
	REL8_3_RC2:1.18
	REL7_3_21:1.6
	REL7_4_19:1.7.4.1
	REL8_0_15:1.8.4.1
	REL8_1_11:1.8.6.1
	REL8_2_6:1.14
	REL8_3_RC1:1.18
	REL8_3_BETA4:1.18
	REL8_3_BETA3:1.18
	REL8_3_BETA2:1.18
	REL8_3_BETA1:1.18
	REL7_3_20:1.6
	REL7_4_18:1.7.4.1
	REL8_0_14:1.8.4.1
	REL8_1_10:1.8.6.1
	REL8_2_5:1.14
	REL7_3_19:1.6
	REL7_4_17:1.7.4.1
	REL8_0_13:1.8.4.1
	REL8_1_9:1.8.6.1
	REL8_2_4:1.14
	REL8_0_12:1.8.4.1
	REL8_1_8:1.8.6.1
	REL8_2_3:1.14
	REL7_3_18:1.6
	REL7_4_16:1.7.4.1
	REL8_0_11:1.8.4.1
	REL8_1_7:1.8.6.1
	REL8_2_2:1.14
	REL8_0_10:1.8.4.1
	REL8_1_6:1.8.6.1
	REL8_2_1:1.14
	REL7_4_15:1.7.4.1
	REL7_3_17:1.6
	REL8_2_STABLE:1.14.0.2
	REL8_2_0:1.14
	REL8_2_RC1:1.14
	REL8_2_BETA3:1.14
	REL8_2_BETA2:1.13
	REL8_1_5:1.8
	REL8_0_9:1.8
	REL7_4_14:1.7
	REL7_3_16:1.6
	REL8_2_BETA1:1.13
	REL7_3_15:1.6
	REL7_4_13:1.7
	REL8_0_8:1.8
	REL8_1_4:1.8
	REL7_3_14:1.6
	REL7_4_12:1.7
	REL8_0_7:1.8
	REL8_1_3:1.8
	REL7_3_13:1.6
	REL7_4_11:1.7
	REL8_0_6:1.8
	REL8_1_2:1.8
	REL7_3_12:1.6
	REL7_4_10:1.7
	REL8_0_5:1.8
	REL8_1_1:1.8
	REL8_1_STABLE:1.8.0.6
	REL8_1_0:1.8
	REL8_1_0RC1:1.8
	REL8_1_0BETA4:1.8
	REL8_1_0BETA3:1.8
	REL7_3_11:1.6
	REL7_4_9:1.7
	REL8_0_4:1.8
	REL8_1_0BETA2:1.8
	REL8_1_0BETA1:1.8
	REL7_2_8:1.5
	REL7_3_10:1.6
	REL7_4_8:1.7
	REL8_0_3:1.8
	REL8_0_2:1.8
	REL7_2_7:1.5
	REL7_3_9:1.6
	REL7_4_7:1.7
	REL8_0_1:1.8
	REL8_0_STABLE:1.8.0.4
	REL8_0_0:1.8.0.2
	REL8_0_0RC5:1.8
	REL8_0_0RC4:1.8
	REL8_0_0RC3:1.8
	REL8_0_0RC2:1.8
	REL8_0_0RC1:1.8
	REL8_0_0BETA5:1.8
	REL8_0_0BETA4:1.8
	REL7_4_6:1.7
	REL7_3_8:1.6
	REL7_2_6:1.5
	REL8_0_0BETA3:1.8
	REL8_0_0BETA2:1.8
	REL7_2_5:1.5
	REL7_4_5:1.7
	REL7_3_7:1.6
	REL7_4_4:1.7
	REL8_0_0BETA1:1.8
	REL7_4_3:1.7
	REL7_4_2:1.7
	REL7_3_6:1.6
	REL7_4_1:1.7
	REL7_3_5:1.6
	REL7_4:1.7
	REL7_4_RC2:1.7
	REL7_4_STABLE:1.7.0.4
	REL7_4_RC1:1.7
	REL7_4_BETA5:1.7
	REL7_4_BETA4:1.7
	REL7_4_BETA3:1.7
	REL7_4_BETA2:1.7
	WIN32_DEV:1.7.0.2
	REL7_4_BETA1:1.7
	REL7_3_4:1.6
	REL7_3_2:1.6
	REL7_2_4:1.5
	REL7_3_STABLE:1.6.0.2
	REL7_2_3:1.5
	REL7_2_STABLE:1.5.0.2
	REL7_2:1.5
	REL7_2_RC2:1.5
	REL7_2_RC1:1.5
	REL7_2_BETA5:1.5
	REL7_2_BETA4:1.5
	REL7_2_BETA3:1.5
	REL7_2_BETA2:1.5
	REL7_2_BETA1:1.5
	REL7_1_2:1.4
	REL7_1_STABLE:1.4.0.2
	REL7_1_BETA:1.4
	REL7_1_BETA3:1.4
	REL7_1_BETA2:1.4
	REL7_1:1.4
	REL7_0_PATCHES:1.1.1.1.0.10
	REL7_0:1.1.1.1
	REL6_5_PATCHES:1.1.1.1.0.8
	REL6_5:1.1.1.1
	REL6_4:1.1.1.1.0.6
	release-6-3:1.1.1.1
	REL2_0B:1.1.1.1.0.4
	REL2_0:1.1.1.1
	Release_2_0_0:1.1.1.1
	Release_1_0_3:1.1.1.1.0.2
	Release_2_0:1.1.1.1
	Release_1_0_2:1.1.1.1
	PG95-1_01:1.1.1.1
	PG95_DIST:1.1.1;
locks; strict;
comment	@# @;


1.22
date	2010.02.08.04.33.53;	author tgl;	state Exp;
branches;
next	1.21;

1.21
date	2009.12.19.01.32.32;	author sriggs;	state Exp;
branches;
next	1.20;

1.20
date	2008.03.21.13.23.27;	author momjian;	state Exp;
branches;
next	1.19;

1.19
date	2008.03.20.17.55.14;	author momjian;	state Exp;
branches;
next	1.18;

1.18
date	2007.09.12.22.10.26;	author tgl;	state Exp;
branches;
next	1.17;

1.17
date	2007.01.12.17.04.54;	author tgl;	state Exp;
branches;
next	1.16;

1.16
date	2007.01.09.02.14.10;	author tgl;	state Exp;
branches;
next	1.15;

1.15
date	2006.12.28.23.16.39;	author tgl;	state Exp;
branches;
next	1.14;

1.14
date	2006.11.01.19.43.17;	author tgl;	state Exp;
branches;
next	1.13;

1.13
date	2006.07.25.19.13.00;	author tgl;	state Exp;
branches;
next	1.12;

1.12
date	2006.05.08.00.00.09;	author tgl;	state Exp;
branches;
next	1.11;

1.11
date	2006.05.07.01.21.30;	author tgl;	state Exp;
branches;
next	1.10;

1.10
date	2006.04.25.22.46.05;	author tgl;	state Exp;
branches;
next	1.9;

1.9
date	2006.01.17.00.09.00;	author tgl;	state Exp;
branches;
next	1.8;

1.8
date	2003.11.29.19.51.40;	author pgsql;	state Exp;
branches
	1.8.4.1
	1.8.6.1;
next	1.7;

1.7
date	2003.02.21.00.06.21;	author tgl;	state Exp;
branches
	1.7.4.1;
next	1.6;

1.6
date	2002.10.20.20.47.31;	author tgl;	state Exp;
branches;
next	1.5;

1.5
date	2001.07.15.22.48.16;	author tgl;	state Exp;
branches;
next	1.4;

1.4
date	2000.07.25.05.26.40;	author tgl;	state Exp;
branches;
next	1.3;

1.3
date	2000.07.21.22.14.09;	author tgl;	state Exp;
branches;
next	1.2;

1.2
date	2000.07.21.06.42.32;	author tgl;	state Exp;
branches;
next	1.1;

1.1
date	96.07.09.06.21.12;	author scrappy;	state Exp;
branches
	1.1.1.1;
next	;

1.1.1.1
date	96.07.09.06.21.12;	author scrappy;	state Exp;
branches;
next	;

1.7.4.1
date	2006.11.01.19.50.14;	author tgl;	state Exp;
branches;
next	;

1.8.4.1
date	2006.11.01.19.50.08;	author tgl;	state Exp;
branches;
next	;

1.8.6.1
date	2006.11.01.19.50.03;	author tgl;	state Exp;
branches;
next	;


desc
@@


1.22
log
@Remove old-style VACUUM FULL (which was known for a little while as
VACUUM FULL INPLACE), along with a boatload of subsidiary code and complexity.
Per discussion, the use case for this method of vacuuming is no longer large
enough to justify maintaining it; not to mention that we don't wish to invest
the work that would be needed to make it play nicely with Hot Standby.

Aside from the code directly related to old-style VACUUM FULL, this commit
removes support for certain WAL record types that could only be generated
within VACUUM FULL, redirect-pointer removal in heap_page_prune, and
nontransactional generation of cache invalidation sinval messages (the last
being the sticking point for Hot Standby).

We still have to retain all code that copes with finding HEAP_MOVED_OFF and
HEAP_MOVED_IN flag bits on existing tuples.  This can't be removed as long
as we want to support in-place update from pre-9.0 databases.
@
text
@$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.21 2009/12/19 01:32:32 sriggs Exp $

Btree Indexing
==============

This directory contains a correct implementation of Lehman and Yao's
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
Efficient Locking for Concurrent Operations on B-Trees, ACM Transactions
on Database Systems, Vol 6, No. 4, December 1981, pp 650-670).  We also
use a simplified version of the deletion logic described in Lanin and
Shasha (V. Lanin and D. Shasha, A Symmetric Concurrent B-Tree Algorithm,
Proceedings of 1986 Fall Joint Computer Conference, pp 380-389).

The Lehman and Yao Algorithm and Insertions
-------------------------------------------

We have made the following changes in order to incorporate the L&Y algorithm
into Postgres:

The requirement that all btree keys be unique is too onerous,
but the algorithm won't work correctly without it.  Fortunately, it is
only necessary that keys be unique on a single tree level, because L&Y
only use the assumption of key uniqueness when re-finding a key in a
parent page (to determine where to insert the key for a split page).
Therefore, we can use the link field to disambiguate multiple
occurrences of the same user key: only one entry in the parent level
will be pointing at the page we had split.  (Indeed we need not look at
the real "key" at all, just at the link field.)  We can distinguish
items at the leaf level in the same way, by examining their links to
heap tuples; we'd never have two items for the same heap tuple.

Lehman and Yao assume that the key range for a subtree S is described
by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent
page.  This does not work for nonunique keys (for example, if we have
enough equal keys to spread across several leaf pages, there *must* be
some equal bounding keys in the first level up).  Therefore we assume
Ki <= v <= Ki+1 instead.  A search that finds exact equality to a
bounding key in an upper tree level must descend to the left of that
key to ensure it finds any equal keys in the preceding page.  An
insertion that sees that the high key of its target page is equal to the
key to be inserted has a choice whether or not to move right, since the new
key could go on either page.  (Currently, we try to find a page where
there is room for the new key without a split.)

Lehman and Yao don't require read locks, but assume that in-memory
copies of tree pages are unshared.  Postgres shares in-memory buffers
among backends.  As a result, we do page-level read locking on btree
pages in order to guarantee that no record is modified while we are
examining it.  This reduces concurrency but guarantees correct
behavior.  An advantage is that when trading in a read lock for a
write lock, we need not re-read the page after getting the write lock.
Since we're also holding a pin on the shared buffer containing the
page, we know that buffer still contains the page and is up-to-date.

We support the notion of an ordered "scan" of an index as well as
insertions, deletions, and simple lookups.  A scan in the forward
direction is no problem; we just use the right-sibling pointers that
L&Y require anyway.  (Thus, once we have descended the tree to the
correct start point for the scan, the scan looks only at leaf pages
and never at higher tree levels.)  To support scans in the backward
direction, we also store a "left sibling" link much like the "right
sibling".  (This adds an extra step to the L&Y split algorithm: while
holding the write lock on the page being split, we also lock its former
right sibling to update that page's left-link.  This is safe since no
writer of that page can be interested in acquiring a write lock on our
page.)  A backwards scan has one additional bit of complexity: after
following the left-link we must account for the possibility that the
left sibling page got split before we could read it.  So, we have to
move right until we find a page whose right-link matches the page we
came from.  (Actually, it's even harder than that; see deletion discussion
below.)

Page read locks are held only for as long as a scan is examining a page.
To minimize lock/unlock traffic, an index scan always searches a leaf page
to identify all the matching items at once, copying their heap tuple IDs
into backend-local storage.  The heap tuple IDs are then processed while
not holding any page lock within the index.  We do continue to hold a pin
on the leaf page, to protect against concurrent deletions (see below).
In this state the scan is effectively stopped "between" pages, either
before or after the page it has pinned.  This is safe in the presence of
concurrent insertions and even page splits, because items are never moved
across pre-existing page boundaries --- so the scan cannot miss any items
it should have seen, nor accidentally return the same item twice.  The scan
must remember the page's right-link at the time it was scanned, since that
is the page to move right to; if we move right to the current right-link
then we'd re-scan any items moved by a page split.  We don't similarly
remember the left-link, since it's best to use the most up-to-date
left-link when trying to move left (see detailed move-left algorithm below).

In most cases we release our lock and pin on a page before attempting
to acquire pin and lock on the page we are moving to.  In a few places
it is necessary to lock the next page before releasing the current one.
This is safe when moving right or up, but not when moving left or down
(else we'd create the possibility of deadlocks).

Lehman and Yao fail to discuss what must happen when the root page
becomes full and must be split.  Our implementation is to split the
root in the same way that any other page would be split, then construct
a new root page holding pointers to both of the resulting pages (which
now become siblings on the next level of the tree).  The new root page
is then installed by altering the root pointer in the meta-data page (see
below).  This works because the root is not treated specially in any
other way --- in particular, searches will move right using its link
pointer if the link is set.  Therefore, searches will find the data
that's been moved into the right sibling even if they read the meta-data
page before it got updated.  This is the same reasoning that makes a
split of a non-root page safe.  The locking considerations are similar too.

When an inserter recurses up the tree, splitting internal pages to insert
links to pages inserted on the level below, it is possible that it will
need to access a page above the level that was the root when it began its
descent (or more accurately, the level that was the root when it read the
meta-data page).  In this case the stack it made while descending does not
help for finding the correct page.  When this happens, we find the correct
place by re-descending the tree until we reach the level one above the
level we need to insert a link to, and then moving right as necessary.
(Typically this will take only two fetches, the meta-data page and the new
root, but in principle there could have been more than one root split
since we saw the root.  We can identify the correct tree level by means of
the level numbers stored in each page.  The situation is rare enough that
we do not need a more efficient solution.)

Lehman and Yao assume fixed-size keys, but we must deal with
variable-size keys.  Therefore there is not a fixed maximum number of
keys per page; we just stuff in as many as will fit.  When we split a
page, we try to equalize the number of bytes, not items, assigned to
each of the resulting pages.  Note we must include the incoming item in
this calculation, otherwise it is possible to find that the incoming
item doesn't fit on the split page where it needs to go!
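
As an illustration of that accounting, here is a self-contained sketch of
byte-balanced split-point selection.  The names (choose_split, sizes) are
invented for this example, and the real split-point code weighs additional
factors beyond raw byte balance, so treat this only as a sketch of the idea:

/*
 * Toy byte-balanced split-point selection.  The incoming item is assumed
 * to have already been counted at its would-be position in "sizes".
 */
#include <stdio.h>
#include <stdlib.h>

/* Return i such that items [0..i-1] go left and [i..nitems-1] go right. */
static int
choose_split(const int *sizes, int nitems)
{
	int		total = 0;
	int		leftbytes = 0;
	int		bestsplit = 1;
	int		bestdelta;

	for (int i = 0; i < nitems; i++)
		total += sizes[i];

	bestdelta = total;			/* worse than any real split */
	for (int i = 1; i < nitems; i++)
	{
		int		delta;

		leftbytes += sizes[i - 1];
		delta = abs(leftbytes - (total - leftbytes));
		if (delta < bestdelta)
		{
			bestdelta = delta;
			bestsplit = i;
		}
	}
	return bestsplit;
}

int
main(void)
{
	/* six items; the 300-byte incoming item sits at its insertion position */
	int		sizes[] = {40, 80, 300, 32, 48, 64};

	printf("split before item %d\n",
		   choose_split(sizes, (int) (sizeof(sizes) / sizeof(sizes[0]))));
	return 0;
}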

The Deletion Algorithm
----------------------

Before deleting a leaf item, we get a super-exclusive lock on the target
page, so that no other backend has a pin on the page when the deletion
starts.  This is not necessary for correctness in terms of the btree index
operations themselves; as explained above, index scans logically stop
"between" pages and so can't lose their place.  The reason we do it is to
provide an interlock between non-full VACUUM and indexscans.  Since VACUUM
deletes index entries before deleting tuples, the super-exclusive lock
guarantees that VACUUM can't delete any heap tuple that an indexscanning
process might be about to visit.  (This guarantee works only for simple
indexscans that visit the heap in sync with the index scan, not for bitmap
scans.  We only need the guarantee when using non-MVCC snapshot rules such
as SnapshotNow, so in practice this is only important for system catalog
accesses.)

Because a page can be split even while someone holds a pin on it, it is
possible that an indexscan will return items that are no longer stored on
the page it has a pin on, but rather somewhere to the right of that page.
To ensure that VACUUM can't prematurely remove such heap tuples, we require
btbulkdelete to obtain super-exclusive lock on every leaf page in the index,
even pages that don't contain any deletable tuples.  This guarantees that
the btbulkdelete call cannot return while any indexscan is still holding
a copy of a deleted index tuple.  Note that this requirement does not say
that btbulkdelete must visit the pages in any particular order.  (See also
on-the-fly deletion, below.) 

There is no such interlocking for deletion of items in internal pages,
since backends keep no lock nor pin on a page they have descended past.
Hence, when a backend is ascending the tree using its stack, it must
be prepared for the possibility that the item it wants is to the left of
the recorded position (but it can't have moved left out of the recorded
page).  Since we hold a lock on the lower page (per L&Y) until we have
re-found the parent item that links to it, we can be assured that the
parent item does still exist and can't have been deleted.  Also, because
we are matching downlink page numbers and not data keys, we don't have any
problem with possibly misidentifying the parent item.

We consider deleting an entire page from the btree only when it's become
completely empty of items.  (Merging partly-full pages would allow better
space reuse, but it seems impractical to move existing data items left or
right to make this happen --- a scan moving in the opposite direction
might miss the items if so.)  Also, we *never* delete the rightmost page
on a tree level (this restriction simplifies the traversal algorithms, as
explained below).

To delete an empty page, we acquire write lock on its left sibling (if
any), the target page itself, the right sibling (there must be one), and
the parent page, in that order.  The parent page must be found using the
same type of search as used to find the parent during an insertion split.
Then we update the side-links in the siblings, mark the target page
deleted, and remove the downlink from the parent, as well as the parent's
upper bounding key for the target (the one separating it from its right
sibling).  This causes the target page's key space to effectively belong
to its right sibling.  (Neither the left nor right sibling pages need to
change their "high key" if any; so there is no problem with possibly not
having enough space to replace a high key.)  The side-links in the target
page are not changed.

(Note: Lanin and Shasha prefer to make the key space move left, but their
argument for doing so hinges on not having left-links, which we have
anyway.  So we simplify the algorithm by moving key space right.)

To preserve consistency on the parent level, we cannot merge the key space
of a page into its right sibling unless the right sibling is a child of
the same parent --- otherwise, the parent's key space assignment changes
too, meaning we'd have to make bounding-key updates in its parent, and
perhaps all the way up the tree.  Since we can't possibly do that
atomically, we forbid this case.  That means that the rightmost child of a
parent node can't be deleted unless it's the only remaining child.

When we delete the last remaining child of a parent page, we mark the
parent page "half-dead" as part of the atomic update that deletes the
child page.  This implicitly transfers the parent's key space to its right
sibling (which it must have, since we never delete the overall-rightmost
page of a level).  Searches ignore the half-dead page and immediately move
right.  We need not worry about insertions into a half-dead page --- insertions
into upper tree levels happen only as a result of splits of child pages, and
the half-dead page no longer has any children that could split.  Therefore
the page stays empty even when we don't have lock on it, and we can complete
its deletion in a second atomic action.

The notion of a half-dead page means that the key space relationship between
the half-dead page's level and its parent's level may be a little out of
whack: key space that appears to belong to the half-dead page's parent on the
parent level may really belong to its right sibling.  To prevent any possible
problems, we hold lock on the deleted child page until we have finished
deleting any now-half-dead parent page(s).  This prevents any insertions into
the transferred keyspace until the operation is complete.  The reason for
doing this is that a sufficiently large number of insertions into the
transferred keyspace, resulting in multiple page splits, could propagate keys
from that keyspace into the parent level, resulting in transiently
out-of-order keys in that level.  It is thought that that wouldn't cause any
serious problem, but it seems too risky to allow.

A deleted page cannot be reclaimed immediately, since there may be other
processes waiting to reference it (ie, search processes that just left the
parent, or scans moving right or left from one of the siblings).  These
processes must observe that the page is marked dead and recover
accordingly.  Searches and forward scans simply follow the right-link
until they find a non-dead page --- this will be where the deleted page's
key-space moved to.

Moving left in a backward scan is complicated because we must consider
the possibility that the left sibling was just split (meaning we must find
the rightmost page derived from the left sibling), plus the possibility
that the page we were just on has now been deleted and hence isn't in the
sibling chain at all anymore.  So the move-left algorithm becomes:
0. Remember the page we are on as the "original page".
1. Follow the original page's left-link (we're done if this is zero).
2. If the current page is live and its right-link matches the "original
   page", we are done.
3. Otherwise, move right one or more times looking for a live page whose
   right-link matches the "original page".  If found, we are done.  (In
   principle we could scan all the way to the right end of the index, but
   in practice it seems better to give up after a small number of tries.
   It's unlikely the original page's sibling split more than a few times
   while we were in flight to it; if we do not find a matching link in a
   few tries, then most likely the original page is deleted.)
4. Return to the "original page".  If it is still live, return to step 1
   (we guessed wrong about it being deleted, and should restart with its
   current left-link).  If it is dead, move right until a non-dead page
   is found (there must be one, since rightmost pages are never deleted),
   mark that as the new "original page", and return to step 1.
This algorithm is correct because the live page found by step 4 will have
the same left keyspace boundary as the page we started from.  Therefore,
when we ultimately exit, it must be on a page whose right keyspace
boundary matches the left boundary of where we started --- which is what
we need to be sure we don't miss or re-scan any items.
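
The loop can be sketched compactly in C.  Everything below (PageInfo, the
in-memory pages array, block numbers as array indexes) is invented for
illustration; in particular, all locking and pinning is omitted:

/*
 * Illustrative move-left loop over a toy in-memory sibling chain.
 */
#include <stdio.h>

#define INVALID_BLOCK	0
#define MAX_RIGHT_TRIES	4		/* give up after a few tries (step 3) */

typedef struct
{
	unsigned	leftlink;
	unsigned	rightlink;
	int			live;			/* 0 once the page is marked deleted */
} PageInfo;

/* toy "index": block numbers index into this array; block 0 is unused */
static PageInfo pages[8];

static unsigned
move_left(unsigned original)
{
	for (;;)
	{
		unsigned	cur = pages[original].leftlink;		/* step 1 */

		if (cur == INVALID_BLOCK)
			return INVALID_BLOCK;	/* already at the leftmost page */

		/* steps 2 and 3: find a live page whose right-link points back */
		for (int tries = 0; tries < MAX_RIGHT_TRIES; tries++)
		{
			if (pages[cur].live && pages[cur].rightlink == original)
				return cur;
			cur = pages[cur].rightlink;
			if (cur == INVALID_BLOCK)
				break;
		}

		/* step 4: re-examine the original page */
		if (pages[original].live)
			continue;			/* guessed wrong; retry with its left-link */
		while (!pages[original].live)
			original = pages[original].rightlink;	/* rightmost never dies */
	}
}

int
main(void)
{
	/* a simple live chain: 1 <-> 2 <-> 3 */
	pages[1] = (PageInfo) {INVALID_BLOCK, 2, 1};
	pages[2] = (PageInfo) {1, 3, 1};
	pages[3] = (PageInfo) {2, INVALID_BLOCK, 1};

	printf("left of 3 is %u\n", move_left(3));	/* prints "left of 3 is 2" */
	return 0;
}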

A deleted page can only be reclaimed once there is no scan or search that
has a reference to it; until then, it must stay in place with its
right-link undisturbed.  We implement this by waiting until all
transactions that were running at the time of deletion are dead, which is
overly strong, but is simple to implement within Postgres.  When marked
dead, a deleted page is labeled with the next-transaction counter value.
VACUUM can reclaim the page for re-use when this transaction number is
older than the oldest open transaction.

Reclaiming a page doesn't actually change its state on disk --- we simply
record it in the shared-memory free space map, from which it will be
handed out the next time a new page is needed for a page split.  The
deleted page's contents will be overwritten by the split operation.
(Note: if we find a deleted page with an extremely old transaction
number, it'd be worthwhile to re-mark it with FrozenTransactionId so that
a later xid wraparound can't cause us to think the page is unreclaimable.
But in more normal situations this would be a waste of a disk write.)

Because we never delete the rightmost page of any level (and in particular
never delete the root), it's impossible for the height of the tree to
decrease.  After massive deletions we might have a scenario in which the
tree is "skinny", with several single-page levels below the root.
Operations will still be correct in this case, but we'd waste cycles
descending through the single-page levels.  To handle this we use an idea
from Lanin and Shasha: we keep track of the "fast root" level, which is
the lowest single-page level.  The meta-data page keeps a pointer to this
level as well as the true root.  All ordinary operations initiate their
searches at the fast root not the true root.  When we split a page that is
alone on its level or delete the next-to-last page on a level (both cases
are easily detected), we have to make sure that the fast root pointer is
adjusted appropriately.  In the split case, we do this work as part of the
atomic update for the insertion into the parent level; in the delete case
as part of the atomic update for the delete (either way, the metapage has
to be the last page locked in the update to avoid deadlock risks).  This
avoids race conditions if two such operations are executing concurrently.

VACUUM needs to do a linear scan of an index to search for deleted pages
that can be reclaimed because they are older than all open transactions.
For efficiency's sake, we'd like to use the same linear scan to search for
deletable tuples.  Before Postgres 8.2, btbulkdelete scanned the leaf pages
in index order, but it is possible to visit them in physical order instead.
The tricky part of this is to avoid missing any deletable tuples in the
presence of concurrent page splits: a page split could easily move some
tuples from a page not yet passed over by the sequential scan to a
lower-numbered page already passed over.  (This wasn't a concern for the
index-order scan, because splits always split right.)  To implement this,
we provide a "vacuum cycle ID" mechanism that makes it possible to
determine whether a page has been split since the current btbulkdelete
cycle started.  If btbulkdelete finds a page that has been split since
it started, and has a right-link pointing to a lower page number, then
it temporarily suspends its sequential scan and visits that page instead.
It must continue to follow right-links and vacuum dead tuples until
reaching a page that either hasn't been split since btbulkdelete started,
or is above the location of the outer sequential scan.  Then it can resume
the sequential scan.  This ensures that all tuples are visited.  It may be
that some tuples are visited twice, but that has no worse effect than an
inaccurate index tuple count (and we can't guarantee an accurate count
anyway in the face of concurrent activity).  Note that this still works
if the has-been-recently-split test has a small probability of false
positives, so long as it never gives a false negative.  This makes it
possible to implement the test with a small counter value stored on each
index page.
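
The "suspend and chase" decision reduces to a small predicate, sketched
below with invented names and types; the real code stores the cycle ID in
each page and makes this check inside its per-page vacuum routine, so this
is only an illustration of the rule stated above:

/*
 * Should the physical-order vacuum scan follow this page's right-link now?
 */
#include <stdbool.h>
#include <stdint.h>

#define INVALID_BLOCK 0

static bool
must_chase_rightlink(uint16_t page_cycleid,	/* split cycle id on the page */
					 uint16_t vac_cycleid,	/* current btbulkdelete cycle */
					 uint32_t rightlink,	/* page's right sibling */
					 uint32_t scanpos)		/* outer sequential scan position */
{
	/*
	 * Follow the right-link only if the page was split during this vacuum
	 * cycle and the split moved tuples to a block the outer sequential
	 * scan has already passed; otherwise the outer scan will visit it
	 * later anyway.
	 */
	return page_cycleid == vac_cycleid &&
		   rightlink != INVALID_BLOCK &&
		   rightlink < scanpos;
}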

On-the-Fly Deletion Of Index Tuples
-----------------------------------

If a process visits a heap tuple and finds that it's dead and removable
(ie, dead to all open transactions, not only that process), then we can
return to the index and mark the corresponding index entry "known dead",
allowing subsequent index scans to skip visiting the heap tuple.  The
"known dead" marking works by setting the index item's lp_flags state
to LP_DEAD.  This is currently only done in plain indexscans, not bitmap
scans, because only plain scans visit the heap and index "in sync" and so
there's not a convenient way to do it for bitmap scans.

Once an index tuple has been marked LP_DEAD it can actually be removed
from the index immediately; since index scans only stop "between" pages,
no scan can lose its place from such a deletion.  We separate the steps
because we allow LP_DEAD to be set with only a share lock (it's exactly
like a hint bit for a heap tuple), but physically removing tuples requires
exclusive lock.  In the current code we try to remove LP_DEAD tuples when
we are otherwise faced with having to split a page to do an insertion (and
hence have exclusive lock on it already).

This leaves the index in a state where it has no entry for a dead tuple
that still exists in the heap.  This is not a problem for the current
implementation of VACUUM, but it could be a problem for anything that
explicitly tries to find index entries for dead tuples.  (However, the
same situation is created by REINDEX, since it doesn't enter dead
tuples into the index.)

It's sufficient to have an exclusive lock on the index page, not a
super-exclusive lock, to do deletion of LP_DEAD items.  It might seem
that this breaks the interlock between VACUUM and indexscans, but that is
not so: as long as an indexscanning process has a pin on the page where
the index item used to be, VACUUM cannot complete its btbulkdelete scan
and so cannot remove the heap tuple.  This is another reason why
btbulkdelete has to get super-exclusive lock on every leaf page, not only
the ones where it actually sees items to delete.

WAL Considerations
------------------

The insertion and deletion algorithms in themselves don't guarantee btree
consistency after a crash.  To provide robustness, we depend on WAL
replay.  A single WAL entry is effectively an atomic action --- we can
redo it from the log if it fails to complete.

Ordinary item insertions (that don't force a page split) are of course
single WAL entries, since they only affect one page.  The same for
leaf-item deletions (if the deletion brings the leaf page to zero items,
it is now a candidate to be deleted, but that is a separate action).

An insertion that causes a page split is logged as a single WAL entry for
the changes occurring on the insertion's level --- including update of the
right sibling's left-link --- followed by a second WAL entry for the
insertion on the parent level (which might itself be a page split, requiring
an additional insertion above that, etc).

For a root split, the follow-on WAL entry is a "new root" entry rather than
an "insertion" entry, but details are otherwise much the same.

Because insertion involves multiple atomic actions, the WAL replay logic
has to detect the case where a page split isn't followed by a matching
insertion on the parent level, and then do that insertion on its own (and
recursively for any subsequent parent insertion, of course).  This is
feasible because the WAL entry for the split contains enough info to know
what must be inserted in the parent level.

When splitting a non-root page that is alone on its level, the required
metapage update (of the "fast root" link) is performed and logged as part
of the insertion into the parent level.  When splitting the root page, the
metapage update is handled as part of the "new root" action.

A page deletion is logged as a single WAL entry covering all four
required page updates (target page, left and right siblings, and parent)
as an atomic event.  (Any required fast-root link update is also part
of the WAL entry.)  If the parent page becomes half-dead but is not
immediately deleted due to a subsequent crash, there is no loss of
consistency, and the empty page will be picked up by the next VACUUM.

Scans during Recovery
---------------------

The btree index type can be safely used during recovery. During recovery
we have at most one writer and potentially many readers. In that
situation the locking requirements can be relaxed and we do not need
double locking during block splits. Each WAL record makes changes to a
single level of the btree using the correct locking sequence and so
is safe for concurrent readers. Some readers may observe a block split
in progress as they descend the tree, but they will simply move right
onto the correct page.

During recovery all index scans start with ignore_killed_tuples = false
and we never set kill_prior_tuple. We do this because the oldest xmin
on the standby server can be older than the oldest xmin on the master
server, which means tuples can be marked as killed even when they are
still visible on the standby. We don't WAL log tuple killed bits, but
they can still appear in the standby because of full page writes. So
we must always ignore them in standby, and that means it's not worth
setting them either.

Note that we talk about scans that are started during recovery. We go to
a little trouble to allow a scan to start during recovery and end during
normal running after recovery has completed. This is a key capability
because it allows running applications to continue while the standby
changes state into a normally running server.

Other Things That Are Handy to Know
-----------------------------------

Page zero of every btree is a meta-data page.  This page stores the
location of the root page --- both the true root and the current effective
root ("fast" root).  To avoid fetching the metapage for every single index
search, we cache a copy of the meta-data information in the index's
relcache entry (rd_amcache).  This is a bit ticklish since using the cache
implies following a root page pointer that could be stale.  We require
every metapage update to send out a SI "relcache inval" message on the
index relation.  That ensures that each backend will flush its cached copy
not later than the start of its next transaction.  Therefore, stale
pointers cannot be used for longer than the current transaction, which
reduces the problem to the same one already dealt with for concurrent
VACUUM --- we can just imagine that each open transaction is potentially
"already in flight" to the old root.

The algorithm assumes we can fit at least three items per page
(a "high key" and two real data items).  Therefore it's unsafe
to accept items larger than 1/3rd page size.  Larger items would
work sometimes, but could cause failures later on depending on
what else gets put on their page.
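(For instance, with the default 8kB block size that works out to a ceiling
of roughly 8192/3 = 2730 bytes per index entry, before allowing for
per-page and per-item overhead.)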

"ScanKey" data structures are used in two fundamentally different ways
in this code, which we describe as "search" scankeys and "insertion"
scankeys.  A search scankey is the kind passed to btbeginscan() or
btrescan() from outside the btree code.  The sk_func pointers in a search
scankey point to comparison functions that return boolean, such as int4lt.
There might be more than one scankey entry for a given index column, or
none at all.  (We require the keys to appear in index column order, but
the order of multiple keys for a given column is unspecified.)  An
insertion scankey uses the same array-of-ScanKey data structure, but the
sk_func pointers point to btree comparison support functions (ie, 3-way
comparators that return int4 values interpreted as <0, =0, >0).  In an
insertion scankey there is exactly one entry per index column.  Insertion
scankeys are built within the btree code (eg, by _bt_mkscankey()) and are
used to locate the starting point of a scan, as well as for locating the
place to insert a new index tuple.  (Note: in the case of an insertion
scankey built from a search scankey, there might be fewer keys than
index columns, indicating that we have no constraints for the remaining
index columns.)  After we have located the starting point of a scan, the
original search scankey is consulted as each index entry is sequentially
scanned to decide whether to return the entry and whether the scan can
stop (see _bt_checkkeys()).

Notes About Data Representation
-------------------------------

The right-sibling link required by L&Y is kept in the page "opaque
data" area, as is the left-sibling link, the page level, and some flags.
The page level counts upwards from zero at the leaf level, to the tree
depth minus 1 at the root.  (Counting up from the leaves ensures that we
don't need to renumber any existing pages when splitting the root.)

The Postgres disk block data format (an array of items) doesn't fit
Lehman and Yao's alternating-keys-and-pointers notion of a disk page,
so we have to play some games.

On a page that is not rightmost in its tree level, the "high key" is
kept in the page's first item, and real data items start at item 2.
The link portion of the "high key" item goes unused.  A page that is
rightmost has no "high key", so data items start with the first item.
Putting the high key at the left, rather than the right, may seem odd,
but it avoids moving the high key as we add data items.

On a leaf page, the data items are simply links to (TIDs of) tuples
in the relation being indexed, with the associated key values.

On a non-leaf page, the data items are down-links to child pages with
bounding keys.  The key in each data item is the *lower* bound for
keys on that child page, so logically the key is to the left of that
downlink.  The high key (if present) is the upper bound for the last
downlink.  The first data item on each such page has no lower bound
--- or lower bound of minus infinity, if you prefer.  The comparison
routines must treat it accordingly.  The actual key stored in the
item is irrelevant, and need not be stored at all.  This arrangement
corresponds to the fact that an L&Y non-leaf page has one more pointer
than key.

Notes to Operator Class Implementors
------------------------------------

With this implementation, we require each supported combination of
datatypes to supply us with a comparison procedure via pg_amproc.
This procedure must take two nonnull values A and B and return an int32 < 0,
0, or > 0 if A < B, A = B, or A > B, respectively.  The procedure must
not return INT_MIN for "A < B", since the value may be negated before
being tested for sign.  A null result is disallowed, too.  See nbtcompare.c
for examples.
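
As a hedged, standalone illustration of that contract (invented names, not
code taken from nbtcompare.c), a comparator that honors both requirements
could look like this:

/*
 * Illustrative 3-way comparator.
 */
#include <stdint.h>

static int32_t
toy_int64_cmp(int64_t a, int64_t b)
{
	/*
	 * Never return "a - b": the difference can overflow an int32, and a
	 * result of INT_MIN could not be safely negated by the caller.
	 * Returning plain -1/0/+1 avoids both hazards.
	 */
	if (a < b)
		return -1;
	if (a > b)
		return 1;
	return 0;
}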

There are some basic assumptions that a btree operator family must satisfy:

An = operator must be an equivalence relation; that is, for all non-null
values A,B,C of the datatype:

	A = A is true						reflexive law
	if A = B, then B = A					symmetric law
	if A = B and B = C, then A = C				transitive law

A < operator must be a strong ordering relation; that is, for all non-null
values A,B,C:

	A < A is false						irreflexive law
	if A < B and B < C, then A < C				transitive law

Furthermore, the ordering is total; that is, for all non-null values A,B:

	exactly one of A < B, A = B, and B < A is true		trichotomy law

(The trichotomy law justifies the definition of the comparison support
procedure, of course.)
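
For implementors who want a quick sanity check, here is a small brute-force
test of these laws for a 3-way comparator over a sample of values.  It is
only an illustrative sketch with invented names, not part of any PostgreSQL
testing machinery:

/*
 * Brute-force check of reflexivity, sign consistency, and transitivity
 * for a toy comparator over a small sample array.
 */
#include <stdio.h>

static int
cmp_int(int a, int b)
{
	return (a > b) - (a < b);
}

static int
sign(int x)
{
	return (x > 0) - (x < 0);
}

int
main(void)
{
	int		sample[] = {-3, -1, 0, 0, 2, 7};
	int		n = (int) (sizeof(sample) / sizeof(sample[0]));
	int		ok = 1;

	for (int i = 0; i < n; i++)
	{
		if (cmp_int(sample[i], sample[i]) != 0)			/* A = A */
			ok = 0;
		for (int j = 0; j < n; j++)
		{
			if (sign(cmp_int(sample[i], sample[j])) !=
				-sign(cmp_int(sample[j], sample[i])))	/* symmetry */
				ok = 0;
			for (int k = 0; k < n; k++)
			{
				if (cmp_int(sample[i], sample[j]) < 0 &&
					cmp_int(sample[j], sample[k]) < 0 &&
					cmp_int(sample[i], sample[k]) >= 0)	/* transitivity of < */
					ok = 0;
				if (cmp_int(sample[i], sample[j]) == 0 &&
					cmp_int(sample[j], sample[k]) == 0 &&
					cmp_int(sample[i], sample[k]) != 0)	/* transitivity of = */
					ok = 0;
			}
		}
	}
	printf("laws %s\n", ok ? "hold" : "violated");
	return 0;
}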

The other three operators are defined in terms of these two in the obvious way,
and must act consistently with them.

For an operator family supporting multiple datatypes, the above laws must hold
when A,B,C are taken from any datatypes in the family.  The transitive laws
are the trickiest to ensure, as in cross-type situations they represent
statements that the behaviors of two or three different operators are
consistent.  As an example, it would not work to put float8 and numeric into
an opfamily, at least not with the current semantics that numerics are
converted to float8 for comparison to a float8.  Because of the limited
accuracy of float8, this means there are distinct numeric values that will
compare equal to the same float8 value, and thus the transitive law fails.

It should be fairly clear why a btree index requires these laws to hold within
a single datatype: without them there is no ordering to arrange the keys with.
Also, index searches using a key of a different datatype require comparisons
to behave sanely across two datatypes.  The extensions to three or more
datatypes within a family are not strictly required by the btree index
mechanism itself, but the planner relies on them for optimization purposes.
@


1.21
log
@Allow read only connections during recovery, known as Hot Standby.

Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.

New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.

This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.

Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.

Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.20 2008/03/21 13:23:27 momjian Exp $
d174 3
a176 3
might miss the items if so.  We could do it during VACUUM FULL, though.)
Also, we *never* delete the rightmost page on a tree level (this
restriction simplifies the traversal algorithms, as explained below).
d269 1
a269 2
older than the oldest open transaction.  (NOTE: VACUUM FULL can reclaim
such pages immediately.)
@


1.20
log
@More README src cleanups.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.19 2008/03/20 17:55:14 momjian Exp $
d404 27
@


1.19
log
@Make source code READMEs more consistent.  Add CVS tags to all README files.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.18 2007/09/12 22:10:26 tgl Exp $
d4 1
a4 1
--------------
@


1.18
log
@Redefine the lp_flags field of item pointers as having four states, rather
than two independent bits (one of which was never used in heap pages anyway,
or at least hadn't been in a very long time).  This gives us flexibility to
add the HOT notions of redirected and dead item pointers without requiring
anything so klugy as magic values of lp_off and lp_len.  The state values
are chosen so that for the states currently in use (pre-HOT) there is no
change in the physical representation.
@
text
@d1 4
a4 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.17 2007/01/12 17:04:54 tgl Exp $
d14 1
a14 1
The Lehman and Yao algorithm and insertions
d131 1
a131 1
The deletion algorithm
d326 1
a326 1
On-the-fly deletion of index tuples
d363 1
a363 1
WAL considerations
d404 1
a404 1
Other things that are handy to know
d449 1
a449 1
Notes about data representation
d483 1
a483 1
Notes to operator class implementors
@


1.17
log
@Add some notes about the basic mathematical laws that the system presumes
hold true for operators in a btree operator family.  This is mostly to
clarify my own thinking about what the planner can assume for optimization
purposes.  (blowing dust off an old abstract-algebra textbook...)
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.16 2007/01/09 02:14:10 tgl Exp $
d330 4
a333 4
"known dead" marking uses the LP_DELETE bit in ItemIds.  This is currently
only done in plain indexscans, not bitmap scans, because only plain scans
visit the heap and index "in sync" and so there's not a convenient way
to do it for bitmap scans.
d335 1
a335 1
Once an index tuple has been marked LP_DELETE it can actually be removed
d338 1
a338 1
because we allow LP_DELETE to be set with only a share lock (it's exactly
d340 1
a340 1
exclusive lock.  In the current code we try to remove LP_DELETE tuples when
d352 1
a352 1
super-exclusive lock, to do deletion of LP_DELETE items.  It might seem
@


1.16
log
@Support ORDER BY ... NULLS FIRST/LAST, and add ASC/DESC/NULLS FIRST/NULLS LAST
per-column options for btree indexes.  The planner's support for this is still
pretty rudimentary; it does not yet know how to plan mergejoins with
nondefault ordering options.  The documentation is pretty rudimentary, too.
I'll work on improving that stuff later.

Note incompatible change from prior behavior: ORDER BY ... USING will now be
rejected if the operator is not a less-than or greater-than member of some
btree opclass.  This prevents less-than-sane behavior if an operator that
doesn't actually define a proper sort ordering is selected.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.15 2006/12/28 23:16:39 tgl Exp $
d488 44
a531 1
being tested for sign.  See nbtcompare.c for examples.
@


1.15
log
@Fix up btree's initial scankey processing to be able to detect redundant
or contradictory keys even in cross-data-type scenarios.  This is another
benefit of the opfamily rewrite: we can find the needed comparison
operators now.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.14 2006/11/01 19:43:17 tgl Exp $
d486 3
a488 2
0, or > 0 if A < B, A = B, or A > B, respectively.  See nbtcompare.c for
examples.
@


1.14
log
@Fix "failed to re-find parent key" btree VACUUM failure by revising page
deletion code to avoid the case where an upper-level btree page remains "half
dead" for a significant period of time, and to block insertions into a key
range that is in process of being re-assigned to the right sibling of the
deleted page's parent.  This prevents the scenario reported by Ed L. wherein
index keys could become out-of-order in the grandparent index level.

Since this is a moderately invasive fix, I'm applying it only to HEAD.
The bug exists back to 7.4, but the back branches will get a different patch.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.13 2006/07/25 19:13:00 tgl Exp $
d483 5
a487 4
With this implementation, we require each supported datatype to supply
us with a comparison procedure via pg_amproc.  This procedure must take
two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B,
A = B, or A > B, respectively.  See nbtcompare.c for examples.
@


1.13
log
@Modify btree to delete known-dead index entries without an actual VACUUM.
When we are about to split an index page to do an insertion, first look
to see if any entries marked LP_DELETE exist on the page, and if so remove
them to try to make enough space for the desired insert.  This should reduce
index bloat in heavily-updated tables, although of course you still need
VACUUM eventually to clean up the heap.

Junji Teramoto
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.12 2006/05/08 00:00:09 tgl Exp $
d204 6
a209 11
page of a level).  No future insertions into the parent level are allowed
to insert keys into the half-dead page --- they must move right to its
sibling, instead.  The parent remains empty and can be deleted in a
separate atomic action.  (However, if it's the rightmost child of its own
parent, it might have to stay half-dead for awhile, until it's also the
only child.)

Note that an empty leaf page is a valid tree state, but an empty interior
page is not legal (an interior page must have children to delegate its
key space to).  So an interior page *must* be marked half-dead as soon
as its last child is deleted.
d214 9
a222 5
parent level may really belong to its right sibling.  We can tolerate this,
however, because insertions and deletions on upper tree levels are always
done by reference to child page numbers, not keys.  The only cost is that
searches may sometimes descend to the half-dead page and then have to move
right, rather than going directly to the sibling page.
@


1.12
log
@Rewrite btree vacuuming to fold the former bulkdelete and cleanup operations
into a single mostly-physical-order scan of the index.  This requires some
ticklish interlocking considerations, but should create no material
performance impact on normal index operations (at least given the
already-committed changes to make scans work a page at a time).  VACUUM
itself should get significantly faster in any index that's degenerated to a
very nonlinear page order.  Also, we save one pass over the index entirely,
except in the case where there were no deletions to do and so only one pass
happened anyway.

Original patch by Heikki Linnakangas, rework by Tom Lane.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.11 2006/05/07 01:21:30 tgl Exp $
d149 2
a150 2
btbulkdelete to obtain super-exclusive lock on every leaf page in the index
(even pages that don't contain any deletable tuples).  This guarantees that
d153 2
a154 1
that btbulkdelete must visit the pages in any particular order.
d324 37
@


1.11
log
@Rewrite btree index scans to work a page at a time in all cases (both
btgettuple and btgetmulti).  This eliminates the problem of "re-finding" the
exact stopping point, since the stopping point is effectively always a page
boundary, and index items are never moved across pre-existing page boundaries.
A small penalty is that the keys_are_unique optimization is effectively
disabled (and, therefore, is removed in this patch), causing us to apply
_bt_checkkeys() to at least one more tuple than necessary when looking up a
unique key.  However, the advantages for non-unique cases seem great enough to
accept this tradeoff.  Aside from simplifying and (sometimes) speeding up the
indexscan code, this will allow us to reimplement btbulkdelete as a largely
sequential scan instead of index-order traversal, thereby significantly
reducing the cost of VACUUM.  Those changes will come in a separate patch.

Original patch by Heikki Linnakangas, rework by Tom Lane.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.10 2006/04/25 22:46:05 tgl Exp $
d296 26
a321 4
VACUUM needs to do a linear scan of an index to search for empty leaf
pages and half-dead parent pages that can be deleted, as well as deleted
pages that can be reclaimed because they are older than all open
transactions.
@


1.10
log
@Arrange to cache btree metapage data in the relcache entry for the index,
thereby saving a visit to the metapage in most index searches/updates.
This wouldn't actually save any I/O (since in the old regime the metapage
generally stayed in cache anyway), but it does provide a useful decrease
in bufmgr traffic in high-contention scenarios.  Per my recent proposal.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.9 2006/01/17 00:09:00 tgl Exp $
d70 16
a85 7
Read locks on a page are held for as long as a scan is examining a page.
But nbtree.c arranges to drop the read lock, but not the buffer pin,
on the current page of a scan before control leaves nbtree.  When we
come back to resume the scan, we have to re-grab the read lock and
then move right if the current item moved (see _bt_restscan()).  Keeping
the pin ensures that the current item cannot move left or be deleted
(see btbulkdelete).
d131 27
a157 8
Deletions of leaf items are handled by getting a super-exclusive lock on
the target page, so that no other backend has a pin on the page when the
deletion starts.  This means no scan is pointing at the page, so no other
backend can lose its place due to the item deletion.

The above does not work for deletion of items in internal pages, since
other backends keep no lock nor pin on a page they have descended past.
Instead, when a backend is ascending the tree using its stack, it must
d232 1
a232 1
Stepping left in a backward scan is complicated because we must consider
@


1.9
log
@Improve comments about btree's use of ScanKey data structures: there
are two basically different kinds of scankeys, and we ought to try harder
to indicate which is used in each place in the code.  I've chosen the names
"search scankey" and "insertion scankey", though you could make about
as good an argument for "operator scankey" and "comparison function
scankey".
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.8 2003/11/29 19:51:40 pgsql Exp $
d319 11
a329 1
root ("fast" root).
@


1.8
log
@
$Header: -> $PostgreSQL Changes ...
@
text
@d1 1
a1 1
$PostgreSQL: /cvsroot/pgsql-server/src/backend/access/nbtree/README,v 1.7 2003/02/21 00:06:21 tgl Exp $
d328 20
a347 9
in this code.  Searches for the initial position for a scan, as well as
insertions, use scankeys in which the comparison function is a 3-way
comparator (<0, =0, >0 result).  These scankeys are built within the
btree code (eg, by _bt_mkscankey()) and used by _bt_compare().  Once we
are positioned, sequential examination of tuples in a scan is done by
_bt_checkkeys() using scankeys in which the comparison functions return
booleans --- for example, int4lt might be used.  These scankeys are the
ones originally passed in from outside the btree code.  Same
representation, but different comparison functions!
@


1.8.4.1
log
@Fix "failed to re-find parent key" btree VACUUM failure by tweaking
_bt_pagedel to recover from the failure: just search the whole parent level
if searching to the right fails.  This does nothing for the underlying problem
that index keys became out-of-order in the grandparent level.  However, we
believe that there is no other consequence worse than slightly inefficient
searching, so this narrow patch seems like the safest solution for the back
branches.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.8 2003/11/29 19:51:40 pgsql Exp $
d192 3
a194 18
done by reference to child page numbers, not keys.  Searches may sometimes
descend to the half-dead page and then have to move right, rather than going
directly to the sibling page, but this is no different from the behavior
during a split.

A special case that arises from using half-dead pages for rightmost children
is that it's possible for the grandparent level's sequence of keys to become
out-of-order.  This occurs when there are a large number of insertions into
the key space that's been implicitly transferred to the right sibling of the
half-dead page's parent.  If the right sibling itself splits, the split
bounding key (which could be less than the high key of the parent page) is
inserted into the grandparent level to the right of the parent page.  This
is pretty ugly, but it causes no serious damage.  Searches, again, may descend
a bit to the left of the optimal path but will be able to recover.  The only
problem is that when it comes time to delete the half-dead page, _bt_pagedel's
normal strategy for finding the target page's parent can fail: the search for
the page's high key may well descend to the right of the parent.  In this case
we recover by searching from the left end of the parent level.
@


1.8.6.1
log
@Fix "failed to re-find parent key" btree VACUUM failure by tweaking
_bt_pagedel to recover from the failure: just search the whole parent level
if searching to the right fails.  This does nothing for the underlying problem
that index keys became out-of-order in the grandparent level.  However, we
believe that there is no other consequence worse than slightly inefficient
searching, so this narrow patch seems like the safest solution for the back
branches.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.8 2003/11/29 19:51:40 pgsql Exp $
d192 3
a194 18
done by reference to child page numbers, not keys.  Searches may sometimes
descend to the half-dead page and then have to move right, rather than going
directly to the sibling page, but this is no different from the behavior
during a split.

A special case that arises from using half-dead pages for rightmost children
is that it's possible for the grandparent level's sequence of keys to become
out-of-order.  This occurs when there are a large number of insertions into
the key space that's been implicitly transferred to the right sibling of the
half-dead page's parent.  If the right sibling itself splits, the split
bounding key (which could be less than the high key of the parent page) is
inserted into the grandparent level to the right of the parent page.  This
is pretty ugly, but it causes no serious damage.  Searches, again, may descend
a bit to the left of the optimal path but will be able to recover.  The only
problem is that when it comes time to delete the half-dead page, _bt_pagedel's
normal strategy for finding the target page's parent can fail: the search for
the page's high key may well descend to the right of the parent.  In this case
we recover by searching from the left end of the parent level.
@


1.7
log
@Make btree index structure adjustments and WAL logging changes needed to
support btree compaction, as per proposal of a few days ago.  btree index
pages no longer store parent links, instead they have a level indicator
(counting up from zero for leaf pages).  The FixBTree recovery logic is
removed, and replaced by code that detects missing parent-level insertions
during WAL replay.  Also, generate appropriate WAL entries when updating
btree metapage and when building a btree index from scratch.  I believe
btree indexes are now completely WAL-legal for the first time.
initdb forced due to index and WAL changes.
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql-server/src/backend/access/nbtree/README,v 1.6 2002/10/20 20:47:31 tgl Exp $
@


1.7.4.1
log
@Fix "failed to re-find parent key" btree VACUUM failure by tweaking
_bt_pagedel to recover from the failure: just search the whole parent level
if searching to the right fails.  This does nothing for the underlying problem
that index keys became out-of-order in the grandparent level.  However, we
believe that there is no other consequence worse than slightly inefficient
searching, so this narrow patch seems like the safest solution for the back
branches.
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.7 2003/02/21 00:06:21 tgl Exp $
d192 3
a194 18
done by reference to child page numbers, not keys.  Searches may sometimes
descend to the half-dead page and then have to move right, rather than going
directly to the sibling page, but this is no different from the behavior
during a split.

A special case that arises from using half-dead pages for rightmost children
is that it's possible for the grandparent level's sequence of keys to become
out-of-order.  This occurs when there are a large number of insertions into
the key space that's been implicitly transferred to the right sibling of the
half-dead page's parent.  If the right sibling itself splits, the split
bounding key (which could be less than the high key of the parent page) is
inserted into the grandparent level to the right of the parent page.  This
is pretty ugly, but it causes no serious damage.  Searches, again, may descend
a bit to the left of the optimal path but will be able to recover.  The only
problem is that when it comes time to delete the half-dead page, _bt_pagedel's
normal strategy for finding the target page's parent can fail: the search for
the page's high key may well descend to the right of the parent.  In this case
we recover by searching from the left end of the parent level.
@


1.6
log
@Fix potential problem with btbulkdelete deleting an indexscan's current
item, if the page containing the current item is split while the indexscan
is stopped and holds no read-lock on the page.  The current item might
move right onto a page that the indexscan holds no pin on.  In the prior
code this would allow btbulkdelete to reach and possibly delete the item,
causing 'my bits moved right off the end of the world!' when the indexscan
finally resumes.  Fix by chaining read-locks to the right during
_bt_restscan and requiring btbulkdelete to LockBufferForCleanup on every
page it scans, not only those with deletable items.  Per my pghackers
message of 25-May-02.  (Too bad no one could think of a better way.)
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql-server/src/backend/access/nbtree/README,v 1.5 2001/07/15 22:48:16 tgl Exp $
d6 4
a9 1
on Database Systems, Vol 6, No. 4, December 1981, pp 650-670).
d11 4
a14 1
We have made the following changes in order to incorporate their algorithm
d17 362
a378 176
+  The requirement that all btree keys be unique is too onerous,
   but the algorithm won't work correctly without it.  Fortunately, it is
   only necessary that keys be unique on a single tree level, because L&Y
   only use the assumption of key uniqueness when re-finding a key in a
   parent node (to determine where to insert the key for a split page).
   Therefore, we can use the link field to disambiguate multiple
   occurrences of the same user key: only one entry in the parent level
   will be pointing at the page we had split.  (Indeed we need not look at
   the real "key" at all, just at the link field.)  We can distinguish
   items at the leaf level in the same way, by examining their links to
   heap tuples; we'd never have two items for the same heap tuple.

+  Lehman and Yao assume that the key range for a subtree S is described
   by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent
   node.  This does not work for nonunique keys (for example, if we have
   enough equal keys to spread across several leaf pages, there *must* be
   some equal bounding keys in the first level up).  Therefore we assume
   Ki <= v <= Ki+1 instead.  A search that finds exact equality to a
   bounding key in an upper tree level must descend to the left of that
   key to ensure it finds any equal keys in the preceding page.  An
   insertion that sees the high key of its target page is equal to the key
   to be inserted has a choice whether or not to move right, since the new
   key could go on either page.  (Currently, we try to find a page where
   there is room for the new key without a split.)
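
   For example (an invented layout, purely to illustrate the rule):
   suppose two adjacent leaf pages hold the keys (5 7 8 8) and (8 8 9),
   so the bounding key separating them on the parent level is 8.  A
   search for 8 that hits that parent key exactly must follow the
   downlink to its left; starting at the second leaf page would
   silently skip the copies of 8 sitting at the end of the first.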

+  Lehman and Yao don't require read locks, but assume that in-memory
   copies of tree nodes are unshared.  Postgres shares in-memory buffers
   among backends.  As a result, we do page-level read locking on btree
   nodes in order to guarantee that no record is modified while we are
   examining it.  This reduces concurrency but guarantees correct
   behavior.  An advantage is that when trading in a read lock for a
   write lock, we need not re-read the page after getting the write lock.
   Since we're also holding a pin on the shared buffer containing the
   page, we know that buffer still contains the page and is up-to-date.

+  We support the notion of an ordered "scan" of an index as well as
   insertions, deletions, and simple lookups.  A scan in the forward
   direction is no problem, we just use the right-sibling pointers that
   L&Y require anyway.  (Thus, once we have descended the tree to the
   correct start point for the scan, the scan looks only at leaf pages
   and never at higher tree levels.)  To support scans in the backward
   direction, we also store a "left sibling" link much like the "right
   sibling".  (This adds an extra step to the L&Y split algorithm: while
   holding the write lock on the page being split, we also lock its former
   right sibling to update that page's left-link.  This is safe since no
   writer of that page can be interested in acquiring a write lock on our
   page.)  A backwards scan has one additional bit of complexity: after
   following the left-link we must account for the possibility that the
   left sibling page got split before we could read it.  So, we have to
   move right until we find a page whose right-link matches the page we
   came from.
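
   As a minimal standalone illustration (invented page numbers and a
   toy struct, not the backend's real data structures), the recovery
   loop after following a possibly-stale left-link looks like this:

       #include <stdio.h>

       struct page { int left, right; };

       /* state after page 1 split off new page 3; a backward scan had
          already captured left-link 1 from page 2 before the split */
       static struct page pages[] = {
           [0] = { .left = -1, .right = 1 },
           [1] = { .left =  0, .right = 3 },
           [2] = { .left =  3, .right = -1 },
           [3] = { .left =  1, .right = 2 },
       };

       static int
       recover_left_sibling(int came_from, int captured_left)
       {
           int p = captured_left;          /* read before the split */

           while (pages[p].right != came_from)
               p = pages[p].right;         /* keep moving right */
           return p;
       }

       int
       main(void)
       {
           /* prints 3: the page created by the split, not page 1 */
           printf("true left sibling of page 2: page %d\n",
                  recover_left_sibling(2, 1));
           return 0;
       }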

+  Read locks on a page are held for as long as a scan is examining a page.
   But nbtree.c arranges to drop the read lock, but not the buffer pin,
   on the current page of a scan before control leaves nbtree.  When we
   come back to resume the scan, we have to re-grab the read lock and
   then move right if the current item moved (see _bt_restscan()).  Keeping
   the pin ensures that the current item cannot move left or be deleted
   (see btbulkdelete).

+  In most cases we release our lock and pin on a page before attempting
   to acquire pin and lock on the page we are moving to.  In a few places
   it is necessary to lock the next page before releasing the current one.
   This is safe when moving right or up, but not when moving left or down
   (else we'd create the possibility of deadlocks).

+  Lehman and Yao fail to discuss what must happen when the root page
   becomes full and must be split.  Our implementation is to split the
   root in the same way that any other page would be split, then construct
   a new root page holding pointers to both of the resulting pages (which
   now become siblings on level 2 of the tree).  The new root page is then
   installed by altering the root pointer in the meta-data page (see
   below).  This works because the root is not treated specially in any
   other way --- in particular, searches will move right using its link
   pointer if the link is set.  Therefore, searches will find the data
   that's been moved into the right sibling even if they read the metadata
   page before it got updated.  This is the same reasoning that makes a
   split of a non-root page safe.  The locking considerations are similar too.

+  Lehman and Yao assume fixed-size keys, but we must deal with
   variable-size keys.  Therefore there is not a fixed maximum number of
   keys per page; we just stuff in as many as will fit.  When we split a
   page, we try to equalize the number of bytes, not items, assigned to
   each of the resulting pages.  Note we must include the incoming item in
   this calculation, otherwise it is possible to find that the incoming
   item doesn't fit on the split page where it needs to go!

In addition, the following things are handy to know:

+  Page zero of every btree is a meta-data page.  This page stores
   the location of the root page, a pointer to a list of free
   pages, and other stuff that's handy to know.  (Currently, we
   never shrink btree indexes so there are never any free pages.)

+  The algorithm assumes we can fit at least three items per page
   (a "high key" and two real data items).  Therefore it's unsafe
   to accept items larger than 1/3rd page size.  Larger items would
   work sometimes, but could cause failures later on depending on
   what else gets put on their page.
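
   For instance, assuming the default 8 kB block size: 8192 / 3 is
   roughly 2730 bytes, and the practical ceiling is somewhat lower
   still once per-page and per-item overhead is subtracted.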

+  This algorithm doesn't guarantee btree consistency after a kernel crash
   or hardware failure.  To do that, we'd need ordered writes, and UNIX
   doesn't support ordered writes (short of fsync'ing every update, which
   is too high a price).  Rebuilding corrupted indexes during restart
   seems more attractive.

+  Deletions are handled by getting a super-exclusive lock on the target
   page, so that no other backend has a pin on the page when the deletion
   starts.  This means no scan is pointing at the page.  This is OK for
   deleting leaf items, probably not OK for deleting internal nodes;
   will need to think harder when it's time to support index compaction.

+  "ScanKey" data structures are used in two fundamentally different ways
   in this code.  Searches for the initial position for a scan, as well as
   insertions, use scankeys in which the comparison function is a 3-way
   comparator (<0, =0, >0 result).  These scankeys are built within the
   btree code (eg, by _bt_mkscankey()) and used by _bt_compare().  Once we
   are positioned, sequential examination of tuples in a scan is done by
   _bt_checkkeys() using scankeys in which the comparison functions return
   booleans --- for example, int4lt might be used.  These scankeys are the
   ones originally passed in from outside the btree code.  Same
   representation, but different comparison functions!

Notes about data representation:

+  The right-sibling link required by L&Y is kept in the page "opaque
   data" area, as is the left-sibling link and some flags.

+  We also keep a parent link in the opaque data, but this link is not
   very trustworthy because it is not updated when the parent page splits.
   Thus, it points to some page on the parent level, but possibly a page
   well to the left of the page's actual current parent.  In most cases
   we do not need this link at all.  Normally we return to a parent page
   using a stack of entries that are made as we descend the tree, as in L&Y.
   There is exactly one case where the stack will not help: concurrent
   root splits.  If an inserter process needs to split what had been the
   root when it started its descent, but finds that that page is no longer
   the root (because someone else split it meanwhile), then it uses the
   parent link to move up to the next level.  This is OK because we do fix
   the parent link in a former root page when splitting it.  This logic
   will work even if the root is split multiple times (even up to creation
   of multiple new levels) before an inserter returns to it.  The same
   could not be said of finding the new root via the metapage, since that
   would work only for a single level of added root.

+  The Postgres disk block data format (an array of items) doesn't fit
   Lehman and Yao's alternating-keys-and-pointers notion of a disk page,
   so we have to play some games.

+  On a page that is not rightmost in its tree level, the "high key" is
   kept in the page's first item, and real data items start at item 2.
   The link portion of the "high key" item goes unused.  A page that is
   rightmost has no "high key", so data items start with the first item.
   Putting the high key at the left, rather than the right, may seem odd,
   but it avoids moving the high key as we add data items.

+  On a leaf page, the data items are simply links to (TIDs of) tuples
   in the relation being indexed, with the associated key values.

+  On a non-leaf page, the data items are down-links to child pages with
   bounding keys.  The key in each data item is the *lower* bound for
   keys on that child page, so logically the key is to the left of that
   downlink.  The high key (if present) is the upper bound for the last
   downlink.  The first data item on each such page has no lower bound
   --- or lower bound of minus infinity, if you prefer.  The comparison
   routines must treat it accordingly.  The actual key stored in the
   item is irrelevant, and need not be stored at all.  This arrangement
   corresponds to the fact that an L&Y non-leaf page has one more pointer
   than key.

Notes to operator class implementors:

+  With this implementation, we require each supported datatype to supply
   us with a comparison procedure via pg_amproc.  This procedure must take
   two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B,
   A = B, or A > B, respectively.  See nbtcompare.c for examples.
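
   A minimal sketch of the required contract, written as plain C
   rather than the backend's fmgr-style wrapper (see nbtcompare.c for
   the real support procedures):

       #include <stdint.h>

       /* three-way comparison: negative, zero, or positive, exactly as
          the support procedure must report A < B, A = B, or A > B */
       static int32_t
       example_int32_cmp(int32_t a, int32_t b)
       {
           if (a < b)
               return -1;
           if (a > b)
               return 1;
           return 0;
       }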
@


1.5
log
@Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers.  Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc.  (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.)  The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method.  I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions.  Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.

Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).

Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error.  We discovered this need long
ago for btree, but missed the other guys.

Oh, one more thing: concurrent VACUUM is now the default.
@
text
@d1 1
a1 1
$Header: /home/projects/pgsql/cvsroot/pgsql/src/backend/access/nbtree/README,v 1.4 2000/07/25 05:26:40 tgl Exp $
d63 2
a64 5
+  Read locks on a page are held for as long as a scan has a pointer
   to the page.  However, locks are always surrendered before the
   sibling page lock is acquired (for readers), so we remain deadlock-
   free.  I will do a formal proof if I get bored anytime soon.
   NOTE: nbtree.c arranges to drop the read lock, but not the buffer pin,
d67 9
a75 1
   then move right if the current item moved (see _bt_restscan()).
@


1.4
log
@Add commentary about varying usage of scankeys in btree code.
@
text
@d1 1
a1 1
$Header: /home/projects/pgsql/cvsroot/pgsql/src/backend/access/nbtree/README,v 1.3 2000/07/21 22:14:09 tgl Exp $
d112 5
a116 9
+  On deletions, we need to adjust the position of active scans on
   the index.  The code in nbtscan.c handles this.  We don't need to
   do this for insertions or splits because _bt_restscan can find the
   new position of the previously-found item.  NOTE that nbtscan.c
   only copes with deletions issued by the current backend.  This
   essentially means that concurrent deletions are not supported, but
   that's true already in the Lehman and Yao algorithm.  nbtscan.c
   exists only to support VACUUM and allow it to delete items while
   it's scanning the index.
@


1.3
log
@Further cleanup of btbuild (CREATE INDEX).  Avoid storing unneeded
left keys during bottom-up index build, and leave some free space
instead of packing the pages to the brim (so as to avoid vast numbers
of page splits during the first interactive insertions).
@
text
@d1 1
a1 1
$Header: /home/projects/pgsql/cvsroot/pgsql/src/backend/access/nbtree/README,v 1.2 2000/07/21 06:42:32 tgl Exp $
d121 11
@


1.2
log
@Major overhaul of btree index code.  Eliminate special BTP_CHAIN logic for
duplicate keys by letting search go to the left rather than right when an
equal key is seen at an upper tree level.  Fix poor choice of page split
point (leading to insertion failures) that was forced by chaining logic.
Don't store leftmost key in non-leaf pages, since it's not necessary.
Don't create root page until something is first stored in the index, so an
unused index is now 8K not 16K.  (Doesn't seem to be as easy to get rid of
the metadata page, unfortunately.)  Massive cleanup of unreadable code,
fix poor, obsolete, and just plain wrong documentation and comments.
See src/backend/access/nbtree/README for the gory details.
@
text
@d1 1
a1 1
$Header: /home/projects/pgsql/cvsroot/pgsql/src/backend/access/nbtree/README,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
d171 4
a174 5
+  With this implementation, we require the user to supply us with
   a procedure for pg_amproc.  This procedure should take two keys
   A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B,
   respectively.  See the contents of that relation for the btree
   access method for some samples.
@


1.1
log
@Initial revision
@
text
@d1 1
a1 1
$Header: /usr/local/cvsroot/postgres95/postgres95/src/backend/access/nbtree/README,v 1.1.1.1 1996/07/09 05:31:28 scrappy Exp $
d4 4
a7 1
btree management algorithm that supports concurrent access for Postgres.
d11 81
a91 22
	+  The requirement that all btree keys be unique is too onerous,
	   but the algorithm won't work correctly without it.  As a result,
	   this implementation adds an OID (guaranteed to be unique) to
	   every key in the index.  This guarantees uniqueness within a set
	   of duplicates.  Space overhead is four bytes.

	   For this reason, when we're passed an index tuple to store by the
	   common access method code, we allocate a larger one and copy the
	   supplied tuple into it.  No Postgres code outside of the btree
	   access method knows about this extra OID.

	+  Lehman and Yao don't require read locks, but assume that in-
	   memory copies of tree nodes are unshared.  Postgres shares
	   in-memory buffers among backends.  As a result, we do page-
	   level read locking on btree nodes in order to guarantee that
	   no record is modified while we are examining it.  This reduces
	   concurrency but guarantees correct behavior.

	+  Read locks on a page are held for as long as a scan has a pointer
	   to the page.  However, locks are always surrendered before the
	   sibling page lock is acquired (for readers), so we remain deadlock-
	   free.  I will do a formal proof if I get bored anytime soon.
d95 73
a167 18
	+  Page zero of every btree is a meta-data page.  This page stores
	   the location of the root page, a pointer to a list of free
	   pages, and other stuff that's handy to know.

	+  This algorithm doesn't really work, since it requires ordered
	   writes, and UNIX doesn't support ordered writes.

	+  There's one other case where we may screw up in this
	   implementation.  When we start a scan, we descend the tree
	   to the key nearest the one in the qual, and once we get there,
	   position ourselves correctly for the qual type (eg, <, >=, etc).
	   If we happen to step off a page, decide we want to get back to
	   it, and fetch the page again, and if some bad person has split
	   the page and moved the last tuple we saw off of it, then the
	   code complains about botched concurrency in an elog(WARN, ...)
	   and gives up the ghost.  This is the ONLY violation of Lehman
	   and Yao's guarantee of correct behavior that I am aware of in
	   this code.
d171 5
a175 15
	With this implementation, we require the user to supply us with
	a procedure for pg_amproc.  This procedure should take two keys
	A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B,
	respectively.  See the contents of that relation for the btree
	access method for some samples.

Notes to mao for implementation document:

	On deletions, we need to adjust the position of active scans on
	the index.  The code in nbtscan.c handles this.  We don't need to
	do this for splits because of the way splits are handled; if they
	happen behind us, we'll automatically go to the next page, and if
	they happen in front of us, we're not affected by them.  For
	insertions, if we inserted a tuple behind the current scan location
	on the current scan page, we move one space ahead.
@


1.1.1.1
log
@Postgres95 1.01 Distribution - Virgin Sources
@
text
@@
