head	1.17;
access;
symbols
	REL9_0_0:1.17
	REL9_1_ALPHA1:1.17
	REL9_0_RC1:1.17
	REL9_0_BETA4:1.17
	REL9_0_STABLE:1.17.0.14
	REL9_0_BETA3:1.17
	REL9_0_BETA2:1.17
	REL7_4_29:1.4
	REL8_0_25:1.7.4.1
	REL8_1_21:1.8
	REL8_2_17:1.11
	REL8_3_11:1.12
	REL8_4_4:1.17
	REL9_0_BETA1:1.17
	REL9_0_ALPHA5_BRANCH:1.17.0.12
	REL9_0_ALPHA5:1.17
	REL7_4_28:1.4
	REL8_0_24:1.7.4.1
	REL8_1_20:1.8
	REL8_2_16:1.11
	REL8_3_10:1.12
	REL8_4_3:1.17
	REL9_0_ALPHA4:1.17
	REL9_0_ALPHA4_BRANCH:1.17.0.10
	REL8_5_ALPHA3:1.17
	REL8_5_ALPHA3_BRANCH:1.17.0.8
	REL7_4_27:1.4
	REL8_0_23:1.7.4.1
	REL8_1_19:1.8
	REL8_2_15:1.11
	REL8_3_9:1.12
	REL8_4_2:1.17
	REL8_5_ALPHA2:1.17
	REL8_5_ALPHA2_BRANCH:1.17.0.6
	REL7_4_26:1.4
	REL8_0_22:1.7.4.1
	REL8_1_18:1.8
	REL8_2_14:1.11
	REL8_3_8:1.12
	REL8_4_1:1.17
	REL8_5_ALPHA1:1.17
	REL8_5_ALPHA1_BRANCH:1.17.0.4
	REL8_4_STABLE:1.17.0.2
	REL8_4_0:1.17
	REL8_4_RC2:1.17
	REL8_4_RC1:1.16
	REL8_4_BETA2:1.16
	REL8_4_BETA1:1.16
	REL7_4_25:1.4
	REL8_0_21:1.7.4.1
	REL8_1_17:1.8
	REL8_2_13:1.11
	REL8_3_7:1.12
	REL7_4_24:1.4
	REL8_0_20:1.7.4.1
	REL8_1_16:1.8
	REL8_2_12:1.11
	REL8_3_6:1.12
	REL7_4_23:1.4
	REL8_0_19:1.7.4.1
	REL8_1_15:1.8
	REL8_2_11:1.11
	REL8_3_5:1.12
	REL7_4_22:1.4
	REL8_0_18:1.7.4.1
	REL8_1_14:1.8
	REL8_2_10:1.11
	REL8_3_4:1.12
	REL7_4_21:1.4
	REL8_0_17:1.7.4.1
	REL8_1_13:1.8
	REL8_2_9:1.11
	REL8_3_3:1.12
	REL7_4_20:1.4
	REL8_0_16:1.7.4.1
	REL8_1_12:1.8
	REL8_2_8:1.11
	REL8_3_2:1.12
	REL8_2_7:1.11
	REL8_3_1:1.12
	REL8_3_STABLE:1.12.0.2
	REL8_3_0:1.12
	REL8_3_RC2:1.12
	REL7_3_21:1.3
	REL7_4_19:1.4
	REL8_0_15:1.7.4.1
	REL8_1_11:1.8
	REL8_2_6:1.11
	REL8_3_RC1:1.12
	REL8_3_BETA4:1.12
	REL8_3_BETA3:1.12
	REL8_3_BETA2:1.12
	REL8_3_BETA1:1.12
	REL7_3_20:1.3
	REL7_4_18:1.4
	REL8_0_14:1.7.4.1
	REL8_1_10:1.8
	REL8_2_5:1.11
	REL7_3_19:1.3
	REL7_4_17:1.4
	REL8_0_13:1.7.4.1
	REL8_1_9:1.8
	REL8_2_4:1.11
	REL8_0_12:1.7.4.1
	REL8_1_8:1.8
	REL8_2_3:1.11
	REL7_3_18:1.3
	REL7_4_16:1.4
	REL8_0_11:1.7.4.1
	REL8_1_7:1.8
	REL8_2_2:1.11
	REL8_0_10:1.7.4.1
	REL8_1_6:1.8
	REL8_2_1:1.11
	REL7_4_15:1.4
	REL7_3_17:1.3
	REL8_2_STABLE:1.11.0.2
	REL8_2_0:1.11
	REL8_2_RC1:1.11
	REL8_2_BETA3:1.11
	REL8_2_BETA2:1.11
	REL8_1_5:1.8
	REL8_0_9:1.7.4.1
	REL7_4_14:1.4
	REL7_3_16:1.3
	REL8_2_BETA1:1.11
	REL7_3_15:1.3
	REL7_4_13:1.4
	REL8_0_8:1.7.4.1
	REL8_1_4:1.8
	REL7_3_14:1.3
	REL7_4_12:1.4
	REL8_0_7:1.7.4.1
	REL8_1_3:1.8
	REL7_3_13:1.3
	REL7_4_11:1.4
	REL8_0_6:1.7.4.1
	REL8_1_2:1.8
	REL7_3_12:1.3
	REL7_4_10:1.4
	REL8_0_5:1.7.4.1
	REL8_1_1:1.8
	REL8_1_STABLE:1.8.0.2
	REL8_1_0:1.8
	REL8_1_0RC1:1.8
	REL8_1_0BETA4:1.8
	REL8_1_0BETA3:1.8
	REL7_3_11:1.3
	REL7_4_9:1.4
	REL8_0_4:1.7.4.1
	REL8_1_0BETA2:1.8
	REL8_1_0BETA1:1.8
	REL7_2_8:1.3
	REL7_3_10:1.3
	REL7_4_8:1.4
	REL8_0_3:1.7.4.1
	REL8_0_2:1.7.4.1
	REL7_2_7:1.3
	REL7_3_9:1.3
	REL7_4_7:1.4
	REL8_0_1:1.7
	REL8_0_STABLE:1.7.0.4
	REL8_0_0:1.7.0.2
	REL8_0_0RC5:1.7
	REL8_0_0RC4:1.7
	REL8_0_0RC3:1.7
	REL8_0_0RC2:1.7
	REL8_0_0RC1:1.7
	REL8_0_0BETA5:1.7
	REL8_0_0BETA4:1.7
	REL7_4_6:1.4
	REL7_3_8:1.3
	REL7_2_6:1.3
	REL8_0_0BETA3:1.7
	REL8_0_0BETA2:1.7
	REL7_2_5:1.3
	REL7_4_5:1.4
	REL7_3_7:1.3
	REL7_4_4:1.4
	REL8_0_0BETA1:1.7
	REL7_4_3:1.4
	REL7_4_2:1.4
	REL7_3_6:1.3
	REL7_4_1:1.4
	REL7_3_5:1.3
	REL7_4:1.4
	REL7_4_RC2:1.4
	REL7_4_STABLE:1.4.0.2
	REL7_4_RC1:1.4
	REL7_4_BETA5:1.3
	REL7_4_BETA4:1.3
	REL7_4_BETA3:1.3
	REL7_4_BETA2:1.3
	WIN32_DEV:1.3.0.6
	REL7_4_BETA1:1.3
	REL7_3_4:1.3
	REL7_3_2:1.3
	REL7_2_4:1.3
	REL7_3_STABLE:1.3.0.4
	REL7_2_3:1.3
	REL7_2_STABLE:1.3.0.2
	REL7_2:1.3
	REL7_2_RC2:1.3
	REL7_2_RC1:1.3
	REL7_2_BETA5:1.3
	REL7_2_BETA4:1.3
	REL7_2_BETA3:1.3
	REL7_2_BETA2:1.3
	REL7_2_BETA1:1.3;
locks; strict;
comment	@# @;


1.17
date	2009.06.22.20.04.28;	author tgl;	state Exp;
branches;
next	1.16;

1.16
date	2009.02.18.15.58.41;	author heikki;	state Exp;
branches;
next	1.15;

1.15
date	2008.11.06.20.51.14;	author tgl;	state Exp;
branches;
next	1.14;

1.14
date	2008.03.21.13.23.28;	author momjian;	state Exp;
branches;
next	1.13;

1.13
date	2008.03.20.17.55.15;	author momjian;	state Exp;
branches;
next	1.12;

1.12
date	2007.05.30.20.11.58;	author tgl;	state Exp;
branches;
next	1.11;

1.11
date	2006.07.23.03.07.58;	author tgl;	state Exp;
branches;
next	1.10;

1.10
date	2006.06.08.14.58.33;	author tgl;	state Exp;
branches;
next	1.9;

1.9
date	2006.03.31.23.32.06;	author tgl;	state Exp;
branches;
next	1.8;

1.8
date	2005.03.04.20.21.06;	author tgl;	state Exp;
branches;
next	1.7;

1.7
date	2004.04.19.23.27.17;	author tgl;	state Exp;
branches
	1.7.4.1;
next	1.6;

1.6
date	2003.11.29.19.51.56;	author pgsql;	state Exp;
branches;
next	1.5;

1.5
date	2003.11.14.04.32.11;	author wieck;	state Exp;
branches;
next	1.4;

1.4
date	2003.10.31.22.48.08;	author tgl;	state Exp;
branches;
next	1.3;

1.3
date	2001.09.29.04.02.22;	author tgl;	state Exp;
branches;
next	1.2;

1.2
date	2001.08.25.18.52.42;	author tgl;	state Exp;
branches;
next	1.1;

1.1
date	2001.07.06.21.04.25;	author tgl;	state Exp;
branches;
next	;

1.7.4.1
date	2005.03.03.16.47.43;	author tgl;	state Exp;
branches;
next	;


desc
@@


1.17
log
@For bulk write operations (eg COPY IN), use a ring buffer of 16MB instead
of the 256KB limit originally enforced by a patch committed 2008-11-06.
Per recent test results, the smaller size resulted in an undesirable decrease
in bulk data loading speed, due to COPY processing frequently getting blocked
for WAL flushing.  This area might need more tweaking later, but this setting
seems to be good enough for 8.4.
@
text
@$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.16 2009/02/18 15:58:41 heikki Exp $

Notes About Shared Buffer Access Rules
======================================

There are two separate access control mechanisms for shared disk buffers:
reference counts (a/k/a pin counts) and buffer content locks.  (Actually,
there's a third level of access control: one must hold the appropriate kind
of lock on a relation before one can legally access any page belonging to
the relation.  Relation-level locks are not discussed here.)

Pins: one must "hold a pin on" a buffer (increment its reference count)
before being allowed to do anything at all with it.  An unpinned buffer is
subject to being reclaimed and reused for a different page at any instant,
so touching it is unsafe.  Normally a pin is acquired via ReadBuffer and
released via ReleaseBuffer.  It is OK and indeed common for a single
backend to pin a page more than once concurrently; the buffer manager
handles this efficiently.  It is considered OK to hold a pin for long
intervals --- for example, sequential scans hold a pin on the current page
until done processing all the tuples on the page, which could be quite a
while if the scan is the outer scan of a join.  Similarly, btree index
scans hold a pin on the current index page.  This is OK because normal
operations never wait for a page's pin count to drop to zero.  (Anything
that might need to do such a wait is instead handled by waiting to obtain
the relation-level lock, which is why you'd better hold one first.)  Pins
may not be held across transaction boundaries, however.

Buffer content locks: there are two kinds of buffer lock, shared and exclusive,
which act just as you'd expect: multiple backends can hold shared locks on
the same buffer, but an exclusive lock prevents anyone else from holding
either shared or exclusive lock.  (These can alternatively be called READ
and WRITE locks.)  These locks are intended to be short-term: they should not
be held for long.  Buffer locks are acquired and released by LockBuffer().
It will *not* work for a single backend to try to acquire multiple locks on
the same buffer.  One must pin a buffer before trying to lock it.
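
As a concrete illustration (a sketch only, not code lifted from the backend;
"rel" and "blkno" are assumed to be in scope), the usual calling sequence for
examining a page is:

	Buffer		buf;
	Page		page;

	buf = ReadBuffer(rel, blkno);			/* acquire a pin */
	LockBuffer(buf, BUFFER_LOCK_SHARE);		/* take shared content lock */
	page = BufferGetPage(buf);
	/* ... examine tuples on the page ... */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);	/* drop the content lock ... */
	ReleaseBuffer(buf);						/* ... then drop the pin */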

Buffer access rules:

1. To scan a page for tuples, one must hold a pin and either shared or
exclusive content lock.  To examine the commit status (XIDs and status bits)
of a tuple in a shared buffer, one must likewise hold a pin and either shared
or exclusive lock.

2. Once one has determined that a tuple is interesting (visible to the
current transaction) one may drop the content lock, yet continue to access
the tuple's data for as long as one holds the buffer pin.  This is what is
typically done by heap scans, since the tuple returned by heap_fetch
contains a pointer to tuple data in the shared buffer.  Therefore the
tuple cannot go away while the pin is held (see rule #5).  Its state could
change, but that is assumed not to matter after the initial determination
of visibility is made.

3. To add a tuple or change the xmin/xmax fields of an existing tuple,
one must hold a pin and an exclusive content lock on the containing buffer.
This ensures that no one else might see a partially-updated state of the
tuple while they are doing visibility checks.

4. It is considered OK to update tuple commit status bits (ie, OR the
values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, HEAP_XMAX_COMMITTED, or
HEAP_XMAX_INVALID into t_infomask) while holding only a shared lock and
pin on a buffer.  This is OK because another backend looking at the tuple
at about the same time would OR the same bits into the field, so there
is little or no risk of conflicting update; what's more, if there did
manage to be a conflict it would merely mean that one bit-update would
be lost and need to be done again later.  These four bits are only hints
(they cache the results of transaction status lookups in pg_clog), so no
great harm is done if they get reset to zero by conflicting updates.

5. To physically remove a tuple or compact free space on a page, one
must hold a pin and an exclusive lock, *and* observe while holding the
exclusive lock that the buffer's shared reference count is one (ie,
no other backend holds a pin).  If these conditions are met then no other
backend can perform a page scan until the exclusive lock is dropped, and
no other backend can be holding a reference to an existing tuple that it
might expect to examine again.  Note that another backend might pin the
buffer (increment the refcount) while one is performing the cleanup, but
it won't be able to actually examine the page until it acquires shared
or exclusive content lock.
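
In the same spirit (again only a sketch, not code from the backend, with
"rel" and "blkno" as above), a typical page modification under rule #3 looks
roughly like this, following the convention that MarkBufferDirty is called
before XLogInsert:

	buf = ReadBuffer(rel, blkno);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);	/* rule #3: exclusive lock */
	/* ... add a tuple or change xmin/xmax fields ... */
	MarkBufferDirty(buf);
	/* ... XLogInsert() the change while still holding the lock ... */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buf);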


Rule #5 only affects VACUUM operations.  Obtaining the
necessary lock is done by the bufmgr routine LockBufferForCleanup().
It first gets an exclusive lock and then checks to see if the shared pin
count is currently 1.  If not, it releases the exclusive lock (but not the
caller's pin) and waits until signaled by another backend, whereupon it
tries again.  The signal will occur when UnpinBuffer decrements the shared
pin count to 1.  As indicated above, this operation might have to wait a
good while before it acquires lock, but that shouldn't matter much for
concurrent VACUUM.  The current implementation only supports a single
waiter for pin-count-1 on any particular shared buffer.  This is enough
for VACUUM's use, since we don't allow multiple VACUUMs concurrently on a
single relation anyway.


Buffer Manager's Internal Locking
---------------------------------

Before PostgreSQL 8.1, all operations of the shared buffer manager itself
were protected by a single system-wide lock, the BufMgrLock, which
unsurprisingly proved to be a source of contention.  The new locking scheme
avoids grabbing system-wide exclusive locks in common code paths.  It works
like this:

* There is a system-wide LWLock, the BufMappingLock, that notionally
protects the mapping from buffer tags (page identifiers) to buffers.
(Physically, it can be thought of as protecting the hash table maintained
by buf_table.c.)  To look up whether a buffer exists for a tag, it is
sufficient to obtain share lock on the BufMappingLock.  Note that one
must pin the found buffer, if any, before releasing the BufMappingLock.
To alter the page assignment of any buffer, one must hold exclusive lock
on the BufMappingLock.  This lock must be held across adjusting the buffer's
header fields and changing the buf_table hash table.  The only common
operation that needs exclusive lock is reading in a page that was not
in shared buffers already, which will require at least a kernel call
and usually a wait for I/O, so it will be slow anyway.

* As of PG 8.2, the BufMappingLock has been split into NUM_BUFFER_PARTITIONS
separate locks, each guarding a portion of the buffer tag space.  This allows
further reduction of contention in the normal code paths.  The partition
that a particular buffer tag belongs to is determined from the low-order
bits of the tag's hash value.  The rules stated above apply to each partition
independently.  If it is necessary to lock more than one partition at a time,
they must be locked in partition-number order to avoid risk of deadlock.
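
For instance (an illustrative sketch; the identifiers follow the buf_table.c
and lwlock.h naming conventions rather than quoting the sources, and "bufTag"
is assumed to be a BufferTag already set up), a lookup under the partitioned
lock looks about like this:

	uint32		hashcode = BufTableHashCode(&bufTag);
	LWLockId	partitionLock;

	/* low-order bits of the tag's hash value select the partition */
	partitionLock = (LWLockId)
		(FirstBufMappingLock + hashcode % NUM_BUFFER_PARTITIONS);

	LWLockAcquire(partitionLock, LW_SHARED);	/* share lock suffices for lookup */
	/* ... probe the buf_table hash table with the tag and hashcode ... */
	LWLockRelease(partitionLock);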

* A separate system-wide LWLock, the BufFreelistLock, provides mutual
exclusion for operations that access the buffer free list or select
buffers for replacement.  This is always taken in exclusive mode since
there are no read-only operations on those data structures.  The buffer
management policy is designed so that BufFreelistLock need not be taken
except in paths that will require I/O, and thus will be slow anyway.
(Details appear below.)  It is never necessary to hold the BufMappingLock
and the BufFreelistLock at the same time.

* Each buffer header contains a spinlock that must be taken when examining
or changing fields of that buffer header.  This allows operations such as
ReleaseBuffer to make local state changes without taking any system-wide
lock.  We use a spinlock, not an LWLock, since there are no cases where
the lock needs to be held for more than a few instructions.

Note that a buffer header's spinlock does not control access to the data
held within the buffer.  Each buffer header also contains an LWLock, the
"buffer content lock", that *does* represent the right to access the data
in the buffer.  It is used per the rules above.

There is yet another set of per-buffer LWLocks, the io_in_progress locks,
that are used to wait for I/O on a buffer to complete.  The process doing
a read or write takes exclusive lock for the duration, and processes that
need to wait for completion try to take shared locks (which they release
immediately upon obtaining).  XXX on systems where an LWLock represents
nontrivial resources, it's fairly annoying to need so many locks.  Possibly
we could use per-backend LWLocks instead (a buffer header would then contain
a field to show which backend is doing its I/O).
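
Summarizing the above, an abbreviated (and approximate) rendition of the
buffer header declared in buf_internals.h shows which mechanism protects what:

	typedef struct sbufdesc
	{
		BufferTag	tag;			/* which page, if any, is in the buffer */
		BufFlags	flags;			/* dirty, valid, io-in-progress, ... */
		uint16		usage_count;	/* clock-sweep usage counter */
		unsigned	refcount;		/* number of backends holding pins */
		int			wait_backend_pid;	/* backend waiting for pin-count 1 */

		slock_t		buf_hdr_lock;	/* spinlock protecting the fields above */

		int			buf_id;			/* buffer's index number (from 0) */
		int			freeNext;		/* link in freelist chain */

		LWLockId	io_in_progress_lock;	/* to wait for I/O to complete */
		LWLockId	content_lock;			/* right to access page contents */
	} BufferDesc;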


Normal Buffer Replacement Strategy
----------------------------------

There is a "free list" of buffers that are prime candidates for replacement.
In particular, buffers that are completely free (contain no valid page) are
always in this list.  We could also throw buffers into this list if we
consider their pages unlikely to be needed soon; however, the current
algorithm never does that.  The list is singly-linked using fields in the
buffer headers; we maintain head and tail pointers in global variables.
(Note: although the list links are in the buffer headers, they are
considered to be protected by the BufFreelistLock, not the buffer-header
spinlocks.)  To choose a victim buffer to recycle when there are no free
buffers available, we use a simple clock-sweep algorithm, which avoids the
need to take system-wide locks during common operations.  It works like
this:

Each buffer header contains a usage counter, which is incremented (up to a
small limit value) whenever the buffer is pinned.  (This requires only the
buffer header spinlock, which would have to be taken anyway to increment the
buffer reference count, so it's nearly free.)

The "clock hand" is a buffer index, NextVictimBuffer, that moves circularly
through all the available buffers.  NextVictimBuffer is protected by the
BufFreelistLock.

The algorithm for a process that needs to obtain a victim buffer is:

1. Obtain BufFreelistLock.

2. If buffer free list is nonempty, remove its head buffer.  If the buffer
is pinned or has a nonzero usage count, it cannot be used; ignore it and
return to the start of step 2.  Otherwise, pin the buffer, release
BufFreelistLock, and return the buffer.

3. Otherwise, select the buffer pointed to by NextVictimBuffer, and
circularly advance NextVictimBuffer for next time.

4. If the selected buffer is pinned or has a nonzero usage count, it cannot
be used.  Decrement its usage count (if nonzero and the buffer is not pinned)
and return to step 3 to examine the next buffer.

5. Pin the selected buffer, release BufFreelistLock, and return the buffer.

(Note that if the selected buffer is dirty, we will have to write it out
before we can recycle it; if someone else pins the buffer meanwhile we will
have to give up and try another buffer.  This however is not a concern
of the basic select-a-victim-buffer algorithm.)
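
The loop below is a self-contained model of steps 3-5 (toy types, no
BufFreelistLock or buffer-header spinlocks, no free-list handling, and no
guard against the case where every buffer is pinned); it is meant only to
make the usage-count aging concrete:

	typedef struct
	{
		int		refcount;		/* pin count */
		int		usage_count;	/* aged by the clock sweep */
	} ToyBuffer;

	static int
	clock_sweep(ToyBuffer *buffers, int nbuffers, int *next_victim)
	{
		for (;;)
		{
			ToyBuffer  *buf = &buffers[*next_victim];

			/* step 3: advance the clock hand circularly */
			*next_victim = (*next_victim + 1) % nbuffers;

			if (buf->refcount == 0)
			{
				if (buf->usage_count == 0)
					return (int) (buf - buffers);	/* step 5: victim found */
				buf->usage_count--;		/* step 4: recently used, age it */
			}
			/* pinned buffers are skipped without touching usage_count */
		}
	}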


Buffer Ring Replacement Strategy
--------------------------------

When running a query that needs to access a large number of pages just once,
such as VACUUM or a large sequential scan, a different strategy is used.
A page that has been touched only by such a scan is unlikely to be needed
again soon, so instead of running the normal clock sweep algorithm and
blowing out the entire buffer cache, a small ring of buffers is allocated
using the normal clock sweep algorithm and those buffers are reused for the
whole scan.  This also implies that much of the write traffic caused by such
a statement will be done by the backend itself and not pushed off onto other
processes.

For sequential scans, a 256KB ring is used. That's small enough to fit in L2
cache, which makes transferring pages from OS cache to shared buffer cache
efficient.  Even less would often be enough, but the ring must be big enough
to accommodate all pages in the scan that are pinned concurrently.  256KB
should also be enough to leave a small cache trail for other backends to
join in a synchronized seq scan.  If a ring buffer is dirtied and its LSN
updated, we would normally have to write and flush WAL before we could
re-use the buffer; in this case we instead discard the buffer from the ring
and (later) choose a replacement using the normal clock-sweep algorithm.
Hence this strategy works best for scans that are read-only (or at worst
update hint bits).  In a scan that modifies every page in the scan, like a
bulk UPDATE or DELETE, the buffers in the ring will always be dirtied and
the ring strategy effectively degrades to the normal strategy.

VACUUM uses a 256KB ring like sequential scans, but dirty pages are not
removed from the ring.  Instead, WAL is flushed if needed to allow reuse of
the buffers.  Before the buffer ring strategy was introduced in 8.3, VACUUM's
buffers were sent to the freelist, which was effectively a buffer ring of 1
buffer, resulting in excessive WAL flushing.  Allowing VACUUM to update
256KB between WAL flushes should be more efficient.

Bulk writes work similarly to VACUUM.  Currently this applies only to
COPY IN and CREATE TABLE AS SELECT.  (Might it be interesting to make
seqscan UPDATE and DELETE use the bulkwrite strategy?)  For bulk writes
we use a ring size of 16MB (but not more than 1/8th of shared_buffers).
Smaller sizes have been shown to result in the COPY blocking too often
for WAL flushes.  While it's okay for a background vacuum to be slowed by
doing its own WAL flushing, we'd prefer that COPY not be subject to that,
so we let it use up a bit more of the buffer arena.
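
Expressed in buffers rather than bytes (a sketch that assumes the default
8KB page size; the authoritative computation lives in freelist.c), the ring
sizes above work out as:

	#define BLCKSZ	8192				/* default page size, for illustration */

	static int
	ring_size_in_buffers(bool bulkwrite, int shared_buffers)
	{
		int		ring;

		if (bulkwrite)
		{
			ring = (16 * 1024 * 1024) / BLCKSZ;	/* 16MB for COPY IN etc. */
			if (ring > shared_buffers / 8)
				ring = shared_buffers / 8;		/* the 1/8th cap noted above */
		}
		else
			ring = (256 * 1024) / BLCKSZ;		/* 256KB for seqscans and VACUUM */

		return ring;
	}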


Background Writer's Processing
------------------------------

The background writer is designed to write out pages that are likely to be
recycled soon, thereby offloading the writing work from active backends.
To do this, it scans forward circularly from the current position of
NextVictimBuffer (which it does not change!), looking for buffers that are
dirty and not pinned nor marked with a positive usage count.  It pins,
writes, and releases any such buffer.
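
In the same toy style as the clock-sweep sketch above (header spinlocks,
pinning, and the actual write are all elided; the struct is the earlier
ToyBuffer plus a dirty flag), the cleaning scan can be pictured as:

	typedef struct
	{
		int		refcount;
		int		usage_count;
		bool	dirty;
	} ToyDirtyBuffer;

	static void
	bgwriter_clean(ToyDirtyBuffer *buffers, int nbuffers, int next_victim)
	{
		int		pos = next_victim;	/* NextVictimBuffer is read, never changed */

		do
		{
			ToyDirtyBuffer *buf = &buffers[pos];

			if (buf->dirty && buf->refcount == 0 && buf->usage_count == 0)
			{
				/* pin, write the page out, mark it clean, unpin */
				buf->dirty = false;
			}
			pos = (pos + 1) % nbuffers;
		} while (pos != next_victim);	/* at most one full circle */
	}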

If we can assume that reading NextVictimBuffer is an atomic action, then
the writer doesn't even need to take the BufFreelistLock in order to look
for buffers to write; it needs only to spinlock each buffer header for long
enough to check the dirtybit.  Even without that assumption, the writer
only needs to take the lock long enough to read the variable value, not
while scanning the buffers.  (This is a very substantial improvement in
the contention cost of the writer compared to PG 8.0.)

During a checkpoint, the writer's strategy must be to write every dirty
buffer (pinned or not!).  We may as well make it start this scan from 
NextVictimBuffer, however, so that the first-to-be-written pages are the
ones that backends might otherwise have to write for themselves soon.

The background writer takes shared content lock on a buffer while writing it
out (and anyone else who flushes buffer contents to disk must do so too).
This ensures that the page image transferred to disk is reasonably consistent.
We might miss a hint-bit update or two but that isn't a problem, for the same
reasons mentioned under buffer access rules.

As of 8.4, the background writer is also started during recovery whenever
there is some form of potentially extended recovery to perform.  It performs
the same service as in normal processing, except that the checkpoints it
writes are technically restartpoints.
@


1.16
log
@Start background writer during archive recovery. Background writer now performs
its usual buffer cleaning duties during archive recovery, and it's responsible
for performing restartpoints.

This requires some changes in postmaster. When the startup process has done
all the initialization and is ready to start WAL redo, it signals the
postmaster to launch the background writer. The postmaster is signaled again
when the point in recovery is reached where we know that the database is in
consistent state. Postmaster isn't interested in that at the moment, but
that's the point where we could let other backends in to perform read-only
queries. The postmaster is signaled a third time when the recovery has ended,
so that the postmaster knows that it's safe to start accepting connections.

The startup process now traps SIGTERM, and performs a "clean" shutdown. If
you do a fast shutdown during recovery, a shutdown restartpoint is performed,
like a shutdown checkpoint, and postmaster kills the processes cleanly. You
still have to continue the recovery at next startup, though.

Currently, the background writer is only launched during archive recovery.
We could launch it during crash recovery as well, but it seems better to keep
that codepath as simple as possible, for the sake of robustness. And it
couldn't do any restartpoints during crash recovery anyway, so it wouldn't be
that useful.

log_restartpoints is gone. Use log_checkpoints instead. This is yet to be
documented.

This whole operation is a pre-requisite for Hot Standby, but has some value of
its own whether the hot standby patch makes 8.4 or not.

Simon Riggs, with lots of modifications by me.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.15 2008/11/06 20:51:14 tgl Exp $
d240 6
a245 1
seqscan UPDATE and DELETE use the bulkwrite strategy?)
@


1.15
log
@Improve bulk-insert performance by keeping the current target buffer pinned
(but not locked, as that would risk deadlocks).  Also, make it work in a small
ring of buffers to avoid having bulk inserts trash the whole buffer arena.

Robert Haas, after an idea of Simon Riggs'.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.14 2008/03/21 13:23:28 momjian Exp $
d271 5
@


1.14
log
@More README src cleanups.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.13 2008/03/20 17:55:15 momjian Exp $
d238 4
@


1.13
log
@Make source code READMEs more consistent.  Add CVS tags to all README files.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.12 2007/05/30 20:11:58 tgl Exp $
d4 1
a4 1
--------------------------------------
@


1.12
log
@Make large sequential scans and VACUUMs work in a limited-size "ring" of
buffers, rather than blowing out the whole shared-buffer arena.  Aside from
avoiding cache spoliation, this fixes the problem that VACUUM formerly tended
to cause a WAL flush for every page it modified, because we had it hacked to
use only a single buffer.  Those flushes will now occur only once per
ring-ful.  The exact ring size, and the threshold for seqscans to switch into
the ring usage pattern, remain under debate; but the infrastructure seems
done.  The key bit of infrastructure is a new optional BufferAccessStrategy
object that can be passed to ReadBuffer operations; this replaces the former
StrategyHintVacuum API.

This patch also changes the buffer usage-count methodology a bit: we now
advance usage_count when first pinning a buffer, rather than when last
unpinning it.  To preserve the behavior that a buffer's lifetime starts to
decrease when it's released, the clock sweep code is modified to not decrement
usage_count of pinned buffers.

Work not done in this commit: teach GiST and GIN indexes to use the vacuum
BufferAccessStrategy for vacuum-driven fetches.

Original patch by Simon, reworked by Heikki and again by Tom.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.11 2006/07/23 03:07:58 tgl Exp $
d3 1
a3 1
Notes about shared buffer access rules
d95 1
a95 1
Buffer manager's internal locking
d155 1
a155 1
Normal buffer replacement strategy
d204 1
a204 1
Buffer ring replacement strategy
d239 1
a239 1
Background writer's processing
@


1.11
log
@Split the buffer mapping table into multiple separately lockable
partitions, as per discussion.  Passes functionality checks, but
I don't have any performance data yet.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.10 2006/06/08 14:58:33 tgl Exp $
d155 2
a156 2
Buffer replacement strategy
---------------------------
d160 10
a169 9
always in this list.  We may also throw buffers into this list if we
consider their pages unlikely to be needed soon.  The list is singly-linked
using fields in the buffer headers; we maintain head and tail pointers in
global variables.  (Note: although the list links are in the buffer headers,
they are considered to be protected by the BufFreelistLock, not the
buffer-header spinlocks.)  To choose a victim buffer to recycle when there
are no free buffers available, we use a simple clock-sweep algorithm, which
avoids the need to take system-wide locks during common operations.  It
works like this:
d203 34
a236 16
A special provision is that while running VACUUM, a backend does not
increment the usage count on buffers it accesses.  In fact, if ReleaseBuffer
sees that it is dropping the pin count to zero and the usage count is zero,
then it appends the buffer to the tail of the free list.  (This implies that
VACUUM, but only VACUUM, must take the BufFreelistLock during ReleaseBuffer;
this shouldn't create much of a contention problem.)  This provision
encourages VACUUM to work in a relatively small number of buffers rather
than blowing out the entire buffer cache.  It is reasonable since a page
that has been touched only by VACUUM is unlikely to be needed again soon.

Since VACUUM usually requests many pages very fast, the effect of this is that
it will get back the very buffers it filled and possibly modified on the next
call and will therefore do its work in a few shared memory buffers, while
being able to use whatever it finds in the cache already.  This also implies
that most of the write traffic caused by a VACUUM will be done by the VACUUM
itself and not pushed off onto other processes.
@


1.10
log
@Remove obsolete comment about VACUUM FULL: it takes buffer content locks
now, and must do so to ensure bgwriter doesn't write a page that is in
process of being compacted.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.9 2006/03/31 23:32:06 tgl Exp $
d117 8
@


1.9
log
@Clean up WAL/buffer interactions as per my recent proposal.  Get rid of the
misleadingly-named WriteBuffer routine, and instead require routines that
change buffer pages to call MarkBufferDirty (which does exactly what it says).
We also require that they do so before calling XLogInsert; this takes care of
the synchronization requirement documented in SyncOneBuffer.  Note that
because bufmgr takes the buffer content lock (in shared mode) while writing
out any buffer, it doesn't matter whether MarkBufferDirty is executed before
the buffer content change is complete, so long as the content change is
completed before releasing exclusive lock on the buffer.  So it's OK to set
the dirtybit before we fill in the LSN.
This eliminates the former kluge of needing to set the dirtybit in LockBuffer.
Aside from making the code more transparent, we can also add some new
debugging assertions, in particular that the caller of MarkBufferDirty must
hold the buffer content lock, not merely a pin.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.8 2005/03/04 20:21:06 tgl Exp $
d81 1
a81 5
VACUUM FULL ignores rule #5, because it instead acquires exclusive lock at
the relation level, which ensures indirectly that no one else is accessing
pages of the relation at all.

Plain (concurrent) VACUUM must respect rule #5 fully.  Obtaining the
d234 6
@


1.8
log
@Replace the BufMgrLock with separate locks on the lookup hashtable and
the freelist, plus per-buffer spinlocks that protect access to individual
shared buffer headers.  This requires abandoning a global freelist (since
the freelist is a global contention point), which shoots down ARC and 2Q
as well as plain LRU management.  Adopt a clock sweep algorithm instead.
Preliminary results show substantial improvement in multi-backend situations.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.7 2004/04/19 23:27:17 tgl Exp $
d15 12
a26 13
so touching it is unsafe.  Typically a pin is acquired via ReadBuffer and
released via WriteBuffer (if one modified the page) or ReleaseBuffer (if not).
It is OK and indeed common for a single backend to pin a page more than
once concurrently; the buffer manager handles this efficiently.  It is
considered OK to hold a pin for long intervals --- for example, sequential
scans hold a pin on the current page until done processing all the tuples
on the page, which could be quite a while if the scan is the outer scan of
a join.  Similarly, btree index scans hold a pin on the current index page.
This is OK because normal operations never wait for a page's pin count to
drop to zero.  (Anything that might need to do such a wait is instead
handled by waiting to obtain the relation-level lock, which is why you'd
better hold one first.)  Pins may not be held across transaction
boundaries, however.
@


1.7
log
@Code review for ARC patch.  Eliminate static variables, improve handling
of VACUUM cases so that VACUUM requests don't affect the ARC state at all,
avoid corner case where BufferSync would uselessly rewrite a buffer that
no longer contains the page that was to be flushed.  Make some minor
other cleanups in and around the bufmgr as well, such as moving PinBuffer
and UnpinBuffer into bufmgr.c where they really belong.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql-server/src/backend/storage/buffer/README,v 1.6 2003/11/29 19:51:56 pgsql Exp $
d7 3
a9 3
reference counts (a/k/a pin counts) and buffer locks.  (Actually, there's
a third level of access control: one must hold the appropriate kind of
lock on a relation before one can legally access any page belonging to
d29 1
a29 1
Buffer locks: there are two kinds of buffer locks, shared and exclusive,
d41 2
a42 2
exclusive lock.  To examine the commit status (XIDs and status bits) of
a tuple in a shared buffer, one must likewise hold a pin and either shared
d46 1
a46 1
current transaction) one may drop the buffer lock, yet continue to access
d55 1
a55 1
one must hold a pin and an exclusive lock on the containing buffer.
d57 1
a57 1
tuple.
d79 1
a79 1
or exclusive lock.
d100 2
a101 43
Buffer replacement strategy interface
-------------------------------------

The file freelist.c contains the buffer cache replacement strategy.
The interface to the strategy is:

	BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck,
	                                 int *cdb_found_index)

This is always the first call made by the buffer manager to check if a disk
page is in memory. If so, the function returns the buffer descriptor and no
further action is required. If the page is not in memory,
StrategyBufferLookup() returns NULL.

The flag recheck tells the strategy that this is a second lookup after
flushing a dirty block. If the buffer manager has to evict another buffer,
it will release the bufmgr lock while doing the write IO. During this time,
another backend could possibly fault in the same page this backend is after,
so we have to check again after the IO is done if the page is in memory now.

*cdb_found_index is set to the index of the found CDB, or -1 if none.
This is not intended to be used by the caller, except to pass to
StrategyReplaceBuffer().

	BufferDesc *StrategyGetBuffer(int *cdb_replace_index)

The buffer manager calls this function to get an unpinned cache buffer whose
content can be evicted. The returned buffer might be empty, clean or dirty.

The returned buffer is only a candidate for replacement.  It is possible that
while the buffer is being written, another backend finds and modifies it, so
that it is dirty again.  The buffer manager will then have to call
StrategyGetBuffer() again to ask for another candidate.

*cdb_replace_index is set to the index of the candidate CDB, or -1 if none
(meaning we are using a previously free buffer).  This is not intended to be
used by the caller, except to pass to StrategyReplaceBuffer().

	void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
	                           int cdb_found_index, int cdb_replace_index)

Called by the buffer manager at the time it is about to change the association
of a buffer with a disk page.
d103 47
a149 25
Before this call, StrategyBufferLookup() still has to find the buffer under
its old tag, even if it was returned by StrategyGetBuffer() as a candidate
for replacement.

After this call, this buffer must be returned for a lookup of the new page
identified by *newTag.

cdb_found_index and cdb_replace_index must be the auxiliary values
returned by previous calls to StrategyBufferLookup and StrategyGetBuffer.

	void StrategyInvalidateBuffer(BufferDesc *buf)

Called by the buffer manager to inform the strategy that the content of this
buffer is being thrown away. This happens for example in the case of dropping
a relation.  The buffer must be clean and unpinned on call.

If the buffer was associated with a disk page, StrategyBufferLookup()
must not return it for this page after the call.

	void StrategyHintVacuum(bool vacuum_active)

Because VACUUM reads all relations of the entire database through the buffer
manager, it can greatly disturb the buffer replacement strategy. This function
is used by VACUUM to inform the strategy that subsequent buffer lookups are
(or are not) caused by VACUUM scanning relations.
d155 53
a207 4
The buffer replacement strategy actually used in freelist.c is a version of
the Adaptive Replacement Cache (ARC) specially tailored for PostgreSQL.

The algorithm works as follows:
a208 61
C is the size of the cache in number of pages (a/k/a shared_buffers or
NBuffers).  ARC uses 2*C Cache Directory Blocks (CDB). A cache directory block
is always associated with one unique file page.  It may point to one shared
buffer, or may indicate that the file page is not in a buffer but has been
accessed recently.

All CDB entries are managed in 4 LRU lists named T1, T2, B1 and B2. The T1 and
T2 lists are the "real" cache entries, linking a file page to a memory buffer
where the page is currently cached. Consequently T1len+T2len <= C. B1 and B2
are ghost cache directories that extend T1 and T2 so that the strategy
remembers pages longer. The strategy tries to keep B1len+T1len and B2len+T2len
both at C. T1len and T2len vary over the runtime depending on the lookup
pattern and its resulting cache hits. The desired size of T1len is called
T1target.

Assuming we have a full cache, one of 5 cases happens on a lookup:

MISS	On a cache miss, depending on T1target and the actual T1len
	the LRU buffer of either T1 or T2 is evicted. Its CDB is removed
	from the T list and added as MRU of the corresponding B list.
	The now free buffer is replaced with the requested page
	and added as MRU of T1.

T1 hit	The T1 CDB is moved to the MRU position of the T2 list.

T2 hit	The T2 CDB is moved to the MRU position of the T2 list.

B1 hit	This means that a buffer that was evicted from the T1
	list is now requested again, indicating that T1target is
	too small (otherwise it would still be in T1 and thus in
	memory). The strategy raises T1target, evicts a buffer
	depending on T1target and T1len and places the CDB at
	MRU of T2.

B2 hit	This means the opposite of B1, the T2 list is probably too
	small. So the strategy lowers T1target, evicts a buffer
	and places the CDB at MRU of T2.

Thus, every page that is found on lookup in any of the four lists
ends up as the MRU of the T2 list. The T2 list therefore is the
"frequency" cache, holding frequently requested pages.

Every page that is seen for the first time ends up as the MRU of the T1
list. The T1 list is the "recency" cache, holding recent newcomers.

The tailoring done for PostgreSQL has to do with the way the query executor
works. A typical UPDATE or DELETE first scans the relation, searching for the
tuples and then calls heap_update() or heap_delete(). This causes at least 2
lookups for the block in the same statement. In the case of multiple matches
in one block even more often. As a result, every block touched in an UPDATE or
DELETE would directly jump into the T2 cache, which is wrong. To prevent this
the strategy remembers which transaction added a buffer to the T1 list and
will not promote it from there into the T2 cache during the same transaction.

Another specialty is the change of the strategy during VACUUM.  Lookups during
VACUUM do not represent application needs, and do not suggest that the page
will be hit again soon, so it would be wrong to change the cache balance
T1target due to that or to cause massive cache evictions. Therefore, a page
read in to satisfy vacuum is placed at the LRU position of the T1 list, for
immediate reuse.  Also, if we happen to get a hit on a CDB entry during
VACUUM, we do not promote the page above its current position in the list.
d215 24
@


1.7.4.1
log
@Replace ARC cache management algorithm with the similar but slightly
simpler 2Q algorithm, to avoid possible problems with the pending patent
on ARC.  Testing so far suggests that there is little if any performance
loss from doing this.

Note that this patch is going into the 8.0 branch only; a much more
extensive revision is planned for HEAD.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.7 2004/04/19 23:27:17 tgl Exp $
d175 1
a175 1
the Two Queue (2Q) algorithm specially tailored for PostgreSQL.
d179 14
a192 6
There are three lists of Cache Directory Blocks (CDBs), T1, T2, and B1.  The
T1 and T2 lists are the "real" cache entries, linking a file page to a memory
buffer where the page is currently cached. Consequently T1len+T2len <=
NBuffers.  B1 is a list of pages that are not currently buffered but recently
were.  We allocate a total of 1.5*NBuffers CDBs, so the maximum length of B1
when cache is fully used is NBuffers/2.  The target size of T1 is NBuffers/4.
d194 1
a194 1
Assuming we have a full cache, one of 4 cases happens on a lookup:
d198 1
a198 1
	from the T list and added as MRU of B1.
d206 10
a215 4
B1 hit	This means that a page that was recently evicted from cache
	is now requested again.  We evict a buffer the same as in
	the normal cache miss case, but the reloaded page goes to the
	MRU of T2 rather than T1.
d217 1
a217 1
Thus, every page that is found on lookup in any of the three lists
d235 2
a236 1
will be hit again soon. Therefore, a page
@


1.6
log
@
$Header: -> $PostgreSQL Changes ...
@
text
@d1 1
a1 1
$PostgreSQL: /cvsroot/pgsql-server/src/backend/storage/buffer/README,v 1.5 2003/11/14 04:32:11 wieck Exp $
d100 2
a101 1
Buffer replacement strategy interface:
d103 2
a104 2
The two files freelist.c and buf_table.c contain the buffer cache
replacement strategy. The interface to the strategy is:
d106 2
a107 2
    BufferDesc *
	StrategyBufferLookup(BufferTag *tagPtr, bool recheck)
d109 67
a175 70
		This is always the first call made by the buffer manager
		to check if a disk page is in memory. If so, the function
		returns the buffer descriptor and no further action is
		required.

		If the page is not in memory, StrategyBufferLookup()
		returns NULL.

		The flag recheck tells the strategy that this is a second
		lookup after flushing a dirty block. If the buffer manager
		has to evict another buffer, it will release the bufmgr lock
		while doing the write IO. During this time, another backend
		could possibly fault in the same page this backend is after,
		so we have to check again after the IO is done if the page
		is in memory now.

	BufferDesc *
	StrategyGetBuffer(void)

		The buffer manager calls this function to get an unpinned
		cache buffer whose content can be evicted. The returned
		buffer might be empty, clean or dirty.

		The returned buffer is only a candidate for replacement.
		It is possible that while the buffer is written, another
		backend finds and modifies it, so that it is dirty again.
		The buffer manager will then call StrategyGetBuffer()
		again to ask for another candidate.

	void
	StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, 
			BlockNumber blockNum)
		
		Called by the buffer manager at the time it is about to
		change the association of a buffer with a disk page.

		Before this call, StrategyBufferLookup() still has to find
		the buffer even if it was returned by StrategyGetBuffer()
		as a candidate for replacement.

		After this call, this buffer must be returned for a
		lookup of the new page identified by rnode and blockNum.

	void
	StrategyInvalidateBuffer(BufferDesc *buf)

		Called from various parts to inform that the content of
		this buffer has been thrown away. This happens for example
		in the case of dropping a relation.

		The buffer must be clean and unpinned on call.

		If the buffer associated with a disk page, StrategyBufferLookup()
		must not return it for this page after the call.

	void
	StrategyHintVacuum(bool vacuum_active)

		Because vacuum reads all relations of the entire database
		through the buffer manager, it can greatly disturb the
		buffer replacement strategy. This function is used by vacuum
		to inform that all subsequent buffer lookups are caused
		by vacuum scanning relations.

		
Buffer replacement strategy:

The buffer replacement strategy actually used in freelist.c is a
version of the Adaptive Replacement Cache (ARC) specially tailored for
PostgreSQL.
d179 67
a245 69
    C is the size of the cache in number of pages (conf: shared_buffers)
	ARC uses 2*C Cache Directory Blocks (CDB). A cache directory block
	is always associated with one unique file page and "can" point to
	one shared buffer.

	All file pages known by the directory are managed in 4 LRU lists
	named B1, T1, T2 and B2. The T1 and T2 lists are the "real" cache
	entries, linking a file page to a memory buffer where the page is
	currently cached. Consequently T1len+T2len <= C. B1 and B2 are
	ghost cache directories that extend T1 and T2 so that the strategy
	remembers pages longer. The strategy tries to keep B1len+T1len and
	B2len+T2len both at C. T1len and T2len vary over the runtime
	depending on the lookup pattern and its resulting cache hits. The
	desired size of T1len is called T1target.

	Assuming we have a full cache, one of 5 cases happens on a lookup:

	MISS	On a cache miss, depending on T1target and the actual T1len
			the LRU buffer of T1 or T2 is evicted. Its CDB is removed
			from the T list and added as MRU of the corresponding B list.
			The now free buffer is replaced with the requested page
			and added as MRU of T1.

	T1 hit	The T1 CDB is moved to the MRU position of the T2 list.

	T2 hit	The T2 CDB is moved to the MRU position of the T2 list.

	B1 hit	This means that a buffer that was evicted from the T1
			list is now requested again, indicating that T1target is
			too small (otherwise it would still be in T1 and thus in
			memory). The strategy raises T1target, evicts a buffer
			depending on T1target and T1len and places the CDB at
			MRU of T2.

	B2 hit	This means the opposite of B1, the T2 list is probably too
			small. So the strategy lowers T1target, evicts a buffer
			and places the CDB at MRU of T2.

	Thus, every page that is found on lookup in any of the four lists
	ends up as the MRU of the T2 list. The T2 list therefore is the
	"frequency" cache, holding frequently requested pages.

	Every page that is seen for the first time ends up as the MRU of
	the T1 list. The T1 list is the "recency" cache, holding recent
	newcomers.

	The tailoring done for PostgreSQL has to do with the way the
	query executor works. A typical UPDATE or DELETE first scans the 
	relation, searching for the tuples and then calls heap_update() or
	heap_delete(). This causes at least 2 lookups for the block in the
	same statement. In the case of multiple matches in one block even
	more often. As a result, every block touched in an UPDATE or DELETE
	would directly jump into the T2 cache, which is wrong. To prevent
	this the strategy remembers which transaction added a buffer to the
	T1 list and will not promote it from there into the T2 cache during
	the same transaction.
	
	Another specialty is the change of the strategy during VACUUM.
	Lookups during VACUUM do not represent application needs, so it
	would be wrong to change the cache balance T1target due to that
	or to cause massive cache evictions. Therefore, a page read in to
	satisfy vacuum (not those that actually cause a hit on any list)
	is placed at the LRU position of the T1 list, for immediate
	reuse. Since Vacuum usually requests many pages very fast, the
	natural side effect of this is that it will get back the very
	buffers it filled and possibly modified on the next call and will
	therefore do its work in a few shared memory buffers, while using
	whatever it finds in the cache already.

@


1.5
log
@Added documentation for the new interface between the buffer manager
and the cache replacement strategy as well as a description of the
ARC algorithm and the special tailoring of that done for PostgreSQL.

Jan
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql-server/src/backend/storage/buffer/README,v 1.4 2003/10/31 22:48:08 tgl Exp $
@


1.4
log
@Update future-tense comments in README to present tense.  Noted by
Neil Conway.
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql-server/src/backend/storage/buffer/README,v 1.3 2001/09/29 04:02:22 tgl Exp $
d98 152
@


1.3
log
@Implement new 'lightweight lock manager' that's intermediate between
existing lock manager and spinlocks: it understands exclusive vs shared
lock but has few other fancy features.  Replace most uses of spinlocks
with lightweight locks.  All remaining uses of spinlocks have very short
lock hold times (a few dozen instructions), so tweak spinlock backoff
code to work efficiently given this assumption.  All per my proposal on
pghackers 26-Sep-01.
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql/src/backend/storage/buffer/README,v 1.2 2001/08/25 18:52:42 tgl Exp $
d82 16
a97 17
As of 7.1, the only operation that removes tuples or compacts free space is
(oldstyle) VACUUM.  It does not have to implement rule #5 directly, because
it instead acquires exclusive lock at the relation level, which ensures
indirectly that no one else is accessing pages of the relation at all.

To implement concurrent VACUUM we will need to make it obey rule #5 fully.
To do this, we'll create a new buffer manager operation
LockBufferForCleanup() that gets an exclusive lock and then checks to see
if the shared pin count is currently 1.  If not, it releases the exclusive
lock (but not the caller's pin) and waits until signaled by another backend,
whereupon it tries again.  The signal will occur when UnpinBuffer
decrements the shared pin count to 1.  As indicated above, this operation
might have to wait a good while before it acquires lock, but that shouldn't
matter much for concurrent VACUUM.  The current implementation only
supports a single waiter for pin-count-1 on any particular shared buffer.
This is enough for VACUUM's use, since we don't allow multiple VACUUMs
concurrently on a single relation anyway.
@


1.2
log
@Replace implementation of pg_log as a relation accessed through the
buffer manager with 'pg_clog', a specialized access method modeled
on pg_xlog.  This simplifies startup (don't need to play games to
open pg_log; among other things, OverrideTransactionSystem goes away),
should improve performance a little, and opens the door to recycling
commit log space by removing no-longer-needed segments of the commit
log.  Actual recycling is not there yet, but I felt I should commit
this part separately since it'd still be useful if we chose not to
do transaction ID wraparound.
@
text
@d1 1
a1 1
$Header: /home/projects/pgsql/cvsroot/pgsql/src/backend/storage/buffer/README,v 1.1 2001/07/06 21:04:25 tgl Exp $
d33 4
a36 6
and WRITE locks.)  These locks are short-term: they should not be held for
long.  They are implemented as per-buffer spinlocks, so another backend
trying to acquire a competing lock will spin as long as you hold yours!
Buffer locks are acquired and released by LockBuffer().  It will *not* work
for a single backend to try to acquire multiple locks on the same buffer.
One must pin a buffer before trying to lock it.
@


1.1
log
@Implement LockBufferForCleanup(), which will allow concurrent VACUUM
to wait until it's safe to remove tuples and compact free space in a
shared buffer page.  Miscellaneous small code cleanups in bufmgr, too.
@
text
@d1 1
a1 1
$Header$
d69 1
a69 1
(they cache the results of transaction status lookups in pg_log), so no
@
