head	1.25;
access;
symbols
	REL9_0_0:1.25
	REL9_1_ALPHA1:1.25
	REL9_0_RC1:1.25
	REL9_0_BETA4:1.25
	REL9_0_STABLE:1.25.0.8
	REL9_0_BETA3:1.25
	REL9_0_BETA2:1.25
	REL7_4_29:1.13
	REL8_0_25:1.15
	REL8_1_21:1.17
	REL8_2_17:1.21
	REL8_3_11:1.22
	REL8_4_4:1.24
	REL9_0_BETA1:1.25
	REL9_0_ALPHA5_BRANCH:1.25.0.6
	REL9_0_ALPHA5:1.25
	REL7_4_28:1.13
	REL8_0_24:1.15
	REL8_1_20:1.17
	REL8_2_16:1.21
	REL8_3_10:1.22
	REL8_4_3:1.24
	REL9_0_ALPHA4:1.25
	REL9_0_ALPHA4_BRANCH:1.25.0.4
	REL8_5_ALPHA3:1.25
	REL8_5_ALPHA3_BRANCH:1.25.0.2
	REL7_4_27:1.13
	REL8_0_23:1.15
	REL8_1_19:1.17
	REL8_2_15:1.21
	REL8_3_9:1.22
	REL8_4_2:1.24
	REL8_5_ALPHA2:1.24
	REL8_5_ALPHA2_BRANCH:1.24.0.6
	REL7_4_26:1.13
	REL8_0_22:1.15
	REL8_1_18:1.17
	REL8_2_14:1.21
	REL8_3_8:1.22
	REL8_4_1:1.24
	REL8_5_ALPHA1:1.24
	REL8_5_ALPHA1_BRANCH:1.24.0.4
	REL8_4_STABLE:1.24.0.2
	REL8_4_0:1.24
	REL8_4_RC2:1.24
	REL8_4_RC1:1.24
	REL8_4_BETA2:1.24
	REL8_4_BETA1:1.24
	REL7_4_25:1.13
	REL8_0_21:1.15
	REL8_1_17:1.17
	REL8_2_13:1.21
	REL8_3_7:1.22
	REL7_4_24:1.13
	REL8_0_20:1.15
	REL8_1_16:1.17
	REL8_2_12:1.21
	REL8_3_6:1.22
	REL7_4_23:1.13
	REL8_0_19:1.15
	REL8_1_15:1.17
	REL8_2_11:1.21
	REL8_3_5:1.22
	REL7_4_22:1.13
	REL8_0_18:1.15
	REL8_1_14:1.17
	REL8_2_10:1.21
	REL8_3_4:1.22
	REL7_4_21:1.13
	REL8_0_17:1.15
	REL8_1_13:1.17
	REL8_2_9:1.21
	REL8_3_3:1.22
	REL7_4_20:1.13
	REL8_0_16:1.15
	REL8_1_12:1.17
	REL8_2_8:1.21
	REL8_3_2:1.22
	REL8_2_7:1.21
	REL8_3_1:1.22
	REL8_3_STABLE:1.22.0.2
	REL8_3_0:1.22
	REL8_3_RC2:1.22
	REL7_3_21:1.12
	REL7_4_19:1.13
	REL8_0_15:1.15
	REL8_1_11:1.17
	REL8_2_6:1.21
	REL8_3_RC1:1.22
	REL8_3_BETA4:1.22
	REL8_3_BETA3:1.22
	REL8_3_BETA2:1.22
	REL8_3_BETA1:1.21
	REL7_3_20:1.12
	REL7_4_18:1.13
	REL8_0_14:1.15
	REL8_1_10:1.17
	REL8_2_5:1.21
	REL7_3_19:1.12
	REL7_4_17:1.13
	REL8_0_13:1.15
	REL8_1_9:1.17
	REL8_2_4:1.21
	REL8_0_12:1.15
	REL8_1_8:1.17
	REL8_2_3:1.21
	REL7_3_18:1.12
	REL7_4_16:1.13
	REL8_0_11:1.15
	REL8_1_7:1.17
	REL8_2_2:1.21
	REL8_0_10:1.15
	REL8_1_6:1.17
	REL8_2_1:1.21
	REL7_4_15:1.13
	REL7_3_17:1.12
	REL8_2_STABLE:1.21.0.2
	REL8_2_0:1.21
	REL8_2_RC1:1.21
	REL8_2_BETA3:1.21
	REL8_2_BETA2:1.21
	REL8_1_5:1.17
	REL8_0_9:1.15
	REL7_4_14:1.13
	REL7_3_16:1.12
	REL8_2_BETA1:1.21
	REL7_3_15:1.12
	REL7_4_13:1.13
	REL8_0_8:1.15
	REL8_1_4:1.17
	REL7_3_14:1.12
	REL7_4_12:1.13
	REL8_0_7:1.15
	REL8_1_3:1.17
	REL7_3_13:1.12
	REL7_4_11:1.13
	REL8_0_6:1.15
	REL8_1_2:1.17
	REL7_3_12:1.12
	REL7_4_10:1.13
	REL8_0_5:1.15
	REL8_1_1:1.17
	REL8_1_STABLE:1.17.0.2
	REL8_1_0:1.17
	REL8_1_0RC1:1.17
	REL8_1_0BETA4:1.17
	REL8_1_0BETA3:1.17
	REL7_3_11:1.12
	REL7_4_9:1.13
	REL8_0_4:1.15
	REL8_1_0BETA2:1.17
	REL8_1_0BETA1:1.17
	REL7_2_8:1.9
	REL7_3_10:1.12
	REL7_4_8:1.13
	REL8_0_3:1.15
	REL8_0_2:1.15
	REL7_2_7:1.9
	REL7_3_9:1.12
	REL7_4_7:1.13
	REL8_0_1:1.15
	REL8_0_STABLE:1.15.0.4
	REL8_0_0:1.15.0.2
	REL8_0_0RC5:1.15
	REL8_0_0RC4:1.15
	REL8_0_0RC3:1.15
	REL8_0_0RC2:1.15
	REL8_0_0RC1:1.15
	REL8_0_0BETA5:1.15
	REL8_0_0BETA4:1.15
	REL7_4_6:1.13
	REL7_3_8:1.12
	REL7_2_6:1.9
	REL8_0_0BETA3:1.15
	REL8_0_0BETA2:1.15
	REL7_2_5:1.9
	REL7_4_5:1.13
	REL7_3_7:1.12
	REL7_4_4:1.13
	REL8_0_0BETA1:1.14
	REL7_4_3:1.13
	REL7_4_2:1.13
	REL7_3_6:1.12
	REL7_4_1:1.13
	REL7_3_5:1.12
	REL7_4:1.13
	REL7_4_RC2:1.13
	REL7_4_STABLE:1.13.0.4
	REL7_4_RC1:1.13
	REL7_4_BETA5:1.13
	REL7_4_BETA4:1.13
	REL7_4_BETA3:1.13
	REL7_4_BETA2:1.13
	WIN32_DEV:1.13.0.2
	REL7_4_BETA1:1.13
	REL7_3_4:1.12
	REL7_3_2:1.12
	REL7_2_4:1.9
	REL7_3_STABLE:1.12.0.2
	REL7_2_3:1.9
	REL7_2_STABLE:1.9.0.2
	REL7_2:1.9
	REL7_2_RC2:1.9
	REL7_2_RC1:1.9
	REL7_2_BETA5:1.9
	REL7_2_BETA4:1.9
	REL7_2_BETA3:1.9
	REL7_2_BETA2:1.9
	REL7_2_BETA1:1.9
	REL7_1_2:1.8
	REL7_1_STABLE:1.8.0.2
	REL7_1_BETA:1.3
	REL7_1_BETA3:1.4
	REL7_1_BETA2:1.4
	REL7_1:1.8
	REL7_0_PATCHES:1.3.0.6
	REL7_0:1.3
	REL6_5_PATCHES:1.3.0.4
	REL6_5:1.3
	REL6_4:1.3.0.2
	release-6-3:1.2
	REL2_0B:1.1.1.1.0.4
	REL2_0:1.1.1.1
	Release_2_0_0:1.1.1.1
	Release_1_0_3:1.1.1.1.0.2
	Release_2_0:1.1.1.1
	Release_1_0_2:1.1.1.1
	PG95-1_01:1.1.1.1
	PG95_DIST:1.1.1;
locks; strict;
comment	@# @;


1.25
date	2009.12.19.01.32.35;	author sriggs;	state Exp;
branches;
next	1.24;

1.24
date	2008.03.21.13.23.28;	author momjian;	state Exp;
branches;
next	1.23;

1.23
date	2008.03.20.17.55.15;	author momjian;	state Exp;
branches;
next	1.22;

1.22
date	2007.10.26.20.45.10;	author alvherre;	state Exp;
branches;
next	1.21;

1.21
date	2006.09.18.22.40.36;	author tgl;	state Exp;
branches;
next	1.20;

1.20
date	2006.07.23.23.08.46;	author tgl;	state Exp;
branches;
next	1.19;

1.19
date	2005.12.11.21.02.18;	author tgl;	state Exp;
branches;
next	1.18;

1.18
date	2005.12.09.01.22.04;	author tgl;	state Exp;
branches;
next	1.17;

1.17
date	2005.06.14.22.15.32;	author tgl;	state Exp;
branches;
next	1.16;

1.16
date	2005.04.29.22.28.24;	author tgl;	state Exp;
branches;
next	1.15;

1.15
date	2004.08.27.17.07.41;	author tgl;	state Exp;
branches;
next	1.14;

1.14
date	2003.11.29.19.51.56;	author pgsql;	state Exp;
branches;
next	1.13;

1.13
date	2003.02.18.03.33.50;	author momjian;	state Exp;
branches;
next	1.12;

1.12
date	2002.10.31.21.34.16;	author tgl;	state Exp;
branches;
next	1.11;

1.11
date	2002.07.19.00.17.40;	author momjian;	state Exp;
branches;
next	1.10;

1.10
date	2002.04.15.23.46.13;	author momjian;	state Exp;
branches;
next	1.9;

1.9
date	2001.09.29.04.02.24;	author tgl;	state Exp;
branches;
next	1.8;

1.8
date	2001.01.26.18.23.12;	author tgl;	state Exp;
branches;
next	1.7;

1.7
date	2001.01.25.03.31.16;	author tgl;	state Exp;
branches;
next	1.6;

1.6
date	2001.01.22.22.30.06;	author tgl;	state Exp;
branches;
next	1.5;

1.5
date	2001.01.16.06.11.34;	author tgl;	state Exp;
branches;
next	1.4;

1.4
date	2000.12.22.00.51.54;	author tgl;	state Exp;
branches;
next	1.3;

1.3
date	98.07.06.18.16.07;	author momjian;	state Exp;
branches;
next	1.2;

1.2
date	98.01.28.02.29.26;	author momjian;	state Exp;
branches;
next	1.1;

1.1
date	96.07.09.06.21.55;	author scrappy;	state Exp;
branches
	1.1.1.1;
next	;

1.1.1.1
date	96.07.09.06.21.55;	author scrappy;	state Exp;
branches;
next	;


desc
@@


1.25
log
@Allow read only connections during recovery, known as Hot Standby.

Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.

New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.

This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.

Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.

Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
@
text
@$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.24 2008/03/21 13:23:28 momjian Exp $

Locking Overview
================

Postgres uses three types of interprocess locks:

* Spinlocks.  These are intended for *very* short-term locks.  If a lock
is to be held more than a few dozen instructions, or across any sort of
kernel call (or even a call to a nontrivial subroutine), don't use a
spinlock. Spinlocks are primarily used as infrastructure for lightweight
locks. They are implemented using a hardware atomic-test-and-set
instruction, if available.  Waiting processes busy-loop until they can
get the lock. There is no provision for deadlock detection, automatic
release on error, or any other nicety.  There is a timeout if the lock
cannot be gotten after a minute or so (which is approximately forever in
comparison to the intended lock hold time, so this is certainly an error
condition).

* Lightweight locks (LWLocks).  These locks are typically used to
interlock access to datastructures in shared memory.  LWLocks support
both exclusive and shared lock modes (for read/write and read-only
access to a shared object). There is no provision for deadlock
detection, but the LWLock manager will automatically release held
LWLocks during elog() recovery, so it is safe to raise an error while
holding LWLocks.  Obtaining or releasing an LWLock is quite fast (a few
dozen instructions) when there is no contention for the lock.  When a
process has to wait for an LWLock, it blocks on a SysV semaphore so as
to not consume CPU time.  Waiting processes will be granted the lock in
arrival order.  There is no timeout.

* Regular locks (a/k/a heavyweight locks).  The regular lock manager
supports a variety of lock modes with table-driven semantics, and it has
full deadlock detection and automatic release at transaction end. 
Regular locks should be used for all user-driven lock requests.

Acquisition of either a spinlock or a lightweight lock causes query
cancel and die() interrupts to be held off until all such locks are
released. No such restriction exists for regular locks, however.  Also
note that we can accept query cancel and die() interrupts while waiting
for a regular lock, but we will not accept them while waiting for
spinlocks or LW locks. It is therefore not a good idea to use LW locks
when the wait time might exceed a few seconds.

The rest of this README file discusses the regular lock manager in detail.


Lock Data Structures
--------------------

Lock methods describe the overall locking behavior.  Currently there are
two lock methods: DEFAULT and USER.

Lock modes describe the type of the lock (read/write or shared/exclusive).
In principle, each lock method can have its own set of lock modes with
different conflict rules, but currently DEFAULT and USER methods use
identical lock mode sets.  See src/tools/backend/index.html and
src/include/storage/lock.h for more details.  (Lock modes are also called
lock types in some places in the code and documentation.)

There are two fundamental lock structures in shared memory: the
per-lockable-object LOCK struct, and the per-lock-and-requestor PROCLOCK
struct.  A LOCK object exists for each lockable object that currently has
locks held or requested on it.  A PROCLOCK struct exists for each backend
that is holding or requesting lock(s) on each LOCK object.

In addition to these, each backend maintains an unshared LOCALLOCK structure
for each lockable object and lock mode that it is currently holding or
requesting.  The shared lock structures only allow a single lock grant to
be made per lockable object/lock mode/backend.  Internally to a backend,
however, the same lock may be requested and perhaps released multiple times
in a transaction, and it can also be held both transactionally and session-
wide.  The internal request counts are held in LOCALLOCK so that the shared
data structures need not be accessed to alter them.

---------------------------------------------------------------------------

The lock manager's LOCK objects contain:

tag -
    The key fields that are used for hashing locks in the shared memory
    lock hash table.  The contents of the tag essentially define an
    individual lockable object.  See include/storage/lock.h for details
    about the supported types of lockable objects.  This is declared as
    a separate struct to ensure that we always zero out the correct number
    of bytes.  It is critical that any alignment-padding bytes the compiler
    might insert in the struct be zeroed out, else the hash computation
    will be random.  (Currently, we are careful to define struct LOCKTAG
    so that there are no padding bytes.)

grantMask -
    This bitmask indicates what types of locks are currently held on the
    given lockable object.  It is used (against the lock table's conflict
    table) to determine if a new lock request will conflict with existing
    lock types held.  Conflicts are determined by bitwise AND operations
    between the grantMask and the conflict table entry for the requested
    lock type.  Bit i of grantMask is 1 if and only if granted[i] > 0.

waitMask -
    This bitmask shows the types of locks being waited for.  Bit i of waitMask
    is 1 if and only if requested[i] > granted[i].

procLocks -
    This is a shared memory queue of all the PROCLOCK structs associated with
    the lock object.  Note that both granted and waiting PROCLOCKs are in this
    list (indeed, the same PROCLOCK might have some already-granted locks and
    be waiting for more!).

waitProcs -
    This is a shared memory queue of all PGPROC structures corresponding to
    backends that are waiting (sleeping) until another backend releases this
    lock.  The process structure holds the information needed to determine
    if it should be woken up when the lock is released.

nRequested -
    Keeps a count of how many times this lock has been attempted to be
    acquired.  The count includes attempts by processes which were put
    to sleep due to conflicts.  It also counts the same backend twice
    if, for example, a backend process first acquires a read and then
    acquires a write.  (But multiple acquisitions of the same lock/lock mode
    within a backend are not multiply counted here; they are recorded
    only in the backend's LOCALLOCK structure.)

requested -
    Keeps a count of how many locks of each type have been attempted.  Only
    elements 1 through MAX_LOCKMODES-1 are used as they correspond to the lock
    type defined constants.  Summing the values of requested[] should come out
    equal to nRequested.

nGranted -
    Keeps count of how many times this lock has been successfully acquired.
    This count does not include attempts that are waiting due to conflicts.
    Otherwise the counting rules are the same as for nRequested.

granted -
    Keeps count of how many locks of each type are currently held.  Once again
    only elements 1 through MAX_LOCKMODES-1 are used (0 is not).  Also, like
    requested[], summing the values of granted[] should total to the value
    of nGranted.

We should always have 0 <= nGranted <= nRequested, and
0 <= granted[i] <= requested[i] for each i.  When all the request counts
go to zero, the LOCK object is no longer needed and can be freed.

---------------------------------------------------------------------------

The lock manager's PROCLOCK objects contain:

tag -
    The key fields that are used for hashing entries in the shared memory
    PROCLOCK hash table.  This is declared as a separate struct to ensure that
    we always zero out the correct number of bytes.  It is critical that any
    alignment-padding bytes the compiler might insert in the struct be zeroed
    out, else the hash computation will be random.  (Currently, we are careful
    to define struct PROCLOCKTAG so that there are no padding bytes.)

    tag.myLock
        Pointer to the shared LOCK object this PROCLOCK is for.

    tag.myProc
        Pointer to the PGPROC of backend process that owns this PROCLOCK.

    Note: it's OK to use pointers here because a PROCLOCK never outlives
    either its lock or its proc.  The tag is therefore unique for as long
    as it needs to be, even though the same tag values might mean something
    else at other times.

holdMask -
    A bitmask for the lock modes successfully acquired by this PROCLOCK.
    This should be a subset of the LOCK object's grantMask, and also a
    subset of the PGPROC object's heldLocks mask (if the PGPROC is
    currently waiting for another lock mode on this lock).

releaseMask -
    A bitmask for the lock modes due to be released during LockReleaseAll.
    This must be a subset of the holdMask.  Note that it is modified without
    taking the partition LWLock, and therefore it is unsafe for any
    backend except the one owning the PROCLOCK to examine/change it.

lockLink -
    List link for shared memory queue of all the PROCLOCK objects for the
    same LOCK.

procLink -
    List link for shared memory queue of all the PROCLOCK objects for the
    same backend.

---------------------------------------------------------------------------


Lock Manager Internal Locking
-----------------------------

Before PostgreSQL 8.2, all of the shared-memory data structures used by
the lock manager were protected by a single LWLock, the LockMgrLock;
any operation involving these data structures had to exclusively lock
LockMgrLock.  Not too surprisingly, this became a contention bottleneck.
To reduce contention, the lock manager's data structures have been split
into multiple "partitions", each protected by an independent LWLock.
Most operations only need to lock the single partition they are working in.
Here are the details:

* Each possible lock is assigned to one partition according to a hash of
its LOCKTAG value.  The partition's LWLock is considered to protect all the
LOCK objects of that partition as well as their subsidiary PROCLOCKs.

* The shared-memory hash tables for LOCKs and PROCLOCKs are organized
so that different partitions use different hash chains, and thus there
is no conflict in working with objects in different partitions.  This
is supported directly by dynahash.c's "partitioned table" mechanism
for the LOCK table: we need only ensure that the partition number is
taken from the low-order bits of the dynahash hash value for the LOCKTAG.
To make it work for PROCLOCKs, we have to ensure that a PROCLOCK's hash
value has the same low-order bits as its associated LOCK.  This requires
a specialized hash function (see proclock_hash).

* Formerly, each PGPROC had a single list of PROCLOCKs belonging to it.
This has now been split into per-partition lists, so that access to a
particular PROCLOCK list can be protected by the associated partition's
LWLock.  (This is not strictly necessary at the moment, because at this
writing a PGPROC's PROCLOCK list is only accessed by the owning backend
anyway.  But it seems forward-looking to maintain a convention for how
other backends could access it.  In any case LockReleaseAll needs to be
able to quickly determine which partition each LOCK belongs to, and
for the currently contemplated number of partitions, this way takes less
shared memory than explicitly storing a partition number in LOCK structs
would require.)

* The other lock-related fields of a PGPROC are only interesting when
the PGPROC is waiting for a lock, so we consider that they are protected
by the partition LWLock of the awaited lock.

For normal lock acquisition and release, it is sufficient to lock the
partition containing the desired lock.  Deadlock checking needs to touch
multiple partitions in general; for simplicity, we just make it lock all
the partitions in partition-number order.  (To prevent LWLock deadlock,
we establish the rule that any backend needing to lock more than one
partition at once must lock them in partition-number order.)  It's
possible that deadlock checking could be done without touching every
partition in typical cases, but since in a properly functioning system
deadlock checking should not occur often enough to be performance-critical,
trying to make this work does not seem a productive use of effort.

A backend's internal LOCALLOCK hash table is not partitioned.  We do store
a copy of the locktag hash code in LOCALLOCK table entries, from which the
partition number can be computed, but this is a straight speed-for-space
tradeoff: we could instead recalculate the partition number from the LOCKTAG
when needed.


The Deadlock Detection Algorithm
--------------------------------

Since we allow user transactions to request locks in any order, deadlock
is possible.  We use a deadlock detection/breaking algorithm that is
fairly standard in essence, but there are many special considerations
needed to deal with Postgres' generalized locking model.

A key design consideration is that we want to make routine operations
(lock grant and release) run quickly when there is no deadlock, and
avoid the overhead of deadlock handling as much as possible.  We do this
using an "optimistic waiting" approach: if a process cannot acquire the
lock it wants immediately, it goes to sleep without any deadlock check. 
But it also sets a delay timer, with a delay of DeadlockTimeout
milliseconds (typically set to one second).  If the delay expires before
the process is granted the lock it wants, it runs the deadlock
detection/breaking code. Normally this code will determine that there is
no deadlock condition, and then the process will go back to sleep and
wait quietly until it is granted the lock.  But if a deadlock condition
does exist, it will be resolved, usually by aborting the detecting
process' transaction.  In this way, we avoid deadlock handling overhead
whenever the wait time for a lock is less than DeadlockTimeout, while
not imposing an unreasonable delay of detection when there is an error.

Lock acquisition (routines LockAcquire and ProcSleep) follows these rules:

1. A lock request is granted immediately if it does not conflict with
any existing or waiting lock request, or if the process already holds an
instance of the same lock type (eg, there's no penalty to acquire a read
lock twice).  Note that a process never conflicts with itself, eg one
can obtain read lock when one already holds exclusive lock.

2. Otherwise the process joins the lock's wait queue.  Normally it will
be added to the end of the queue, but there is an exception: if the
process already holds locks on this same lockable object that conflict
with the request of any pending waiter, then the process will be
inserted in the wait queue just ahead of the first such waiter.  (If we
did not make this check, the deadlock detection code would adjust the
queue order to resolve the conflict, but it's relatively cheap to make
the check in ProcSleep and avoid a deadlock timeout delay in this case.)
 Note special case when inserting before the end of the queue: if the
process's request does not conflict with any existing lock nor any
waiting request before its insertion point, then go ahead and grant the
lock without waiting.

When a lock is released, the lock release routine (ProcLockWakeup) scans
the lock object's wait queue.  Each waiter is awoken if (a) its request
does not conflict with already-granted locks, and (b) its request does
not conflict with the requests of prior un-wakable waiters.  Rule (b)
ensures that conflicting requests are granted in order of arrival. There
are cases where a later waiter must be allowed to go in front of
conflicting earlier waiters to avoid deadlock, but it is not
ProcLockWakeup's responsibility to recognize these cases; instead, the
deadlock detection code will re-order the wait queue when necessary.

To perform deadlock checking, we use the standard method of viewing the
various processes as nodes in a directed graph (the waits-for graph or
WFG).  There is a graph edge leading from process A to process B if A
waits for B, ie, A is waiting for some lock and B holds a conflicting
lock.  There is a deadlock condition if and only if the WFG contains a
cycle.  We detect cycles by searching outward along waits-for edges to
see if we return to our starting point.  There are three possible
outcomes:

1. All outgoing paths terminate at a running process (which has no
outgoing edge).

2. A deadlock is detected by looping back to the start point.  We
resolve such a deadlock by canceling the start point's lock request and
reporting an error in that transaction, which normally leads to
transaction abort and release of that transaction's held locks.  Note
that it's sufficient to cancel one request to remove the cycle; we don't
need to kill all the transactions involved.

3. Some path(s) loop back to a node other than the start point.  This
indicates a deadlock, but one that does not involve our starting
process. We ignore this condition on the grounds that resolving such a
deadlock is the responsibility of the processes involved --- killing our
start-point process would not resolve the deadlock.  So, cases 1 and 3
both report "no deadlock".

Postgres' situation is a little more complex than the standard discussion
of deadlock detection, for two reasons:

1. A process can be waiting for more than one other process, since there
might be multiple PROCLOCKs of (non-conflicting) lock types that all
conflict with the waiter's request.  This creates no real difficulty
however; we simply need to be prepared to trace more than one outgoing
edge.

2. If a process A is behind a process B in some lock's wait queue, and
their requested locks conflict, then we must say that A waits for B, since
ProcLockWakeup will never awaken A before B.  This creates additional
edges in the WFG.  We call these "soft" edges, as opposed to the "hard"
edges induced by locks already held.  Note that if B already holds any
locks conflicting with A's request, then their relationship is a hard edge
not a soft edge.

A "soft" block, or wait-priority block, has the same potential for
inducing deadlock as a hard block.  However, we may be able to resolve
a soft block without aborting the transactions involved: we can instead
rearrange the order of the wait queue.  This rearrangement reverses the
direction of the soft edge between two processes with conflicting requests
whose queue order is reversed.  If we can find a rearrangement that
eliminates a cycle without creating new ones, then we can avoid an abort.
Checking for such possible rearrangements is the trickiest part of the
algorithm.

The workhorse of the deadlock detector is a routine FindLockCycle() which
is given a starting point process (which must be a waiting process).
It recursively scans outward across waits-for edges as discussed above.
If it finds no cycle involving the start point, it returns "false".
(As discussed above, we can ignore cycles not involving the start point.)
When such a cycle is found, FindLockCycle() returns "true", and as it
unwinds it also builds a list of any "soft" edges involved in the cycle.
If the resulting list is empty then there is a hard deadlock and the
configuration cannot succeed.  However, if the list is not empty, then
reversing any one of the listed edges through wait-queue rearrangement
will eliminate that cycle.  Since such a reversal might create cycles
elsewhere, we may need to try every possibility.  Therefore, we need to
be able to invoke FindLockCycle() on hypothetical configurations (wait
orders) as well as the current real order.

The easiest way to handle this seems to be to have a lookaside table that
shows the proposed new queue order for each wait queue that we are
considering rearranging.  This table is checked by FindLockCycle, and it
believes the proposed queue order rather than the real order for each lock
that has an entry in the lookaside table.

We build a proposed new queue order by doing a "topological sort" of the
existing entries.  Each soft edge that we are currently considering
reversing creates a property of the partial order that the topological sort
has to enforce.  We must use a sort method that preserves the input
ordering as much as possible, so as not to gratuitously break arrival
order for processes not involved in a deadlock.  (This is not true of the
tsort method shown in Knuth, for example, but it's easily done by a simple
doubly-nested-loop method that emits the first legal candidate at each
step.  Fortunately, we don't need a highly efficient sort algorithm, since
the number of partial order constraints is not likely to be large.)  Note
that failure of the topological sort tells us we have conflicting ordering
constraints, and therefore that the last-added soft edge reversal
conflicts with a prior edge reversal.  We need to detect this case to
avoid an infinite loop in the case where no possible rearrangement will
work: otherwise, we might try a reversal, find that it still leads to
a cycle, then try to un-reverse the reversal while trying to get rid of
that cycle, etc etc.  Topological sort failure tells us the un-reversal
is not a legitimate move in this context.

So, the basic step in our rearrangement method is to take a list of
soft edges in a cycle (as returned by FindLockCycle()) and successively
try the reversal of each one as a topological-sort constraint added to
whatever constraints we are already considering.  We recursively search
through all such sets of constraints to see if any one eliminates all
the deadlock cycles at once.  Although this might seem impossibly
inefficient, it shouldn't be a big problem in practice, because there
will normally be very few, and not very large, deadlock cycles --- if
any at all.  So the combinatorial inefficiency isn't going to hurt us.
Besides, it's better to spend some time to guarantee that we've checked
all possible escape routes than to abort a transaction when we didn't
really have to.

Each edge reversal constraint can be viewed as requesting that the waiting
process A be moved to before the blocking process B in the wait queue they
are both in.  This action will reverse the desired soft edge, as well as
any other soft edges between A and other processes it is advanced over.
No other edges will be affected (note this is actually a constraint on our
topological sort method to not re-order the queue more than necessary.)
Therefore, we can be sure we have not created any new deadlock cycles if
neither FindLockCycle(A) nor FindLockCycle(B) discovers any cycle.  Given
the above-defined behavior of FindLockCycle, each of these searches is
necessary as well as sufficient, since FindLockCycle starting at the
original start point will not complain about cycles that include A or B
but not the original start point.

In short then, a proposed rearrangement of the wait queue(s) is determined
by one or more broken soft edges A->B, fully specified by the output of
topological sorts of each wait queue involved, and then tested by invoking
FindLockCycle() starting at the original start point as well as each of
the mentioned processes (A's and B's).  If none of the tests detect a
cycle, then we have a valid configuration and can implement it by
reordering the wait queues per the sort outputs (and then applying
ProcLockWakeup on each reordered queue, in case a waiter has become wakable).
If any test detects a soft cycle, we can try to resolve it by adding each
soft link in that cycle, in turn, to the proposed rearrangement list.
This is repeated recursively until we either find a workable rearrangement
or determine that none exists.  In the latter case, the outer level
resolves the deadlock by aborting the original start-point transaction.

The particular order in which rearrangements are tried depends on the
order FindLockCycle() happens to scan in, so if there are multiple
workable rearrangements of the wait queues, then it is unspecified which
one will be chosen.  What's more important is that we guarantee to try
every queue rearrangement that could lead to success.  (For example,
if we have A before B before C and the needed order constraints are
C before A and B before C, we would first discover that A before C
doesn't work and try the rearrangement C before A before B.  This would
eventually lead to the discovery of the additional constraint B before C.)

Got that?

Miscellaneous Notes
-------------------

1. It is easily proven that no deadlock will be missed due to our
asynchronous invocation of deadlock checking.  A deadlock cycle in the WFG
is formed when the last edge in the cycle is added; therefore the last
process in the cycle to wait (the one from which that edge is outgoing) is
certain to detect and resolve the cycle when it later runs CheckDeadLock.
This holds even if that edge addition created multiple cycles; the process
may indeed abort without ever noticing those additional cycles, but we
don't particularly care.  The only other possible creation of deadlocks is
during deadlock resolution's rearrangement of wait queues, and we already
saw that that algorithm will prove that it creates no new deadlocks before
it attempts to actually execute any rearrangement.

2. It is not certain that a deadlock will be resolved by aborting the
last-to-wait process.  If earlier waiters in the cycle have not yet run
CheckDeadLock, then the first one to do so will be the victim.

3. No live (wakable) process can be missed by ProcLockWakeup, since it
examines every member of the wait queue (this was not true in the 7.0
implementation, BTW).  Therefore, if ProcLockWakeup is always invoked
after a lock is released or a wait queue is rearranged, there can be no
failure to wake a wakable process.  One should also note that
LockWaitCancel (abort a waiter due to outside factors) must run
ProcLockWakeup, in case the canceled waiter was soft-blocking other
waiters.

4. We can minimize excess rearrangement-trial work by being careful to
scan the wait queue from the front when looking for soft edges.  For
example, if we have queue order A,B,C and C has deadlock conflicts with
both A and B, we want to generate the "C before A" constraint first,
rather than wasting time with "C before B", which won't move C far
enough up.  So we look for soft edges outgoing from C starting at the
front of the wait queue.

5. The working data structures needed by the deadlock detection code can
be limited to numbers of entries computed from MaxBackends.  Therefore,
we can allocate the worst-case space needed during backend startup. This
seems a safer approach than trying to allocate workspace on the fly; we
don't want to risk having the deadlock detector run out of memory, else
we really have no guarantees at all that deadlock will be detected.

6. We abuse the deadlock detector to implement autovacuum cancellation.
When we run the detector and we find that there's an autovacuum worker
involved in the waits-for graph, we store a pointer to its PGPROC, and
return a special return code (unless a hard deadlock has been detected).
The caller can then send a cancellation signal.  This implements the
principle that autovacuum has a low locking priority (eg it must not block
DDL on the table).

User Locks
----------

User locks are handled totally on the application side as long term
cooperative locks which extend beyond the normal transaction boundaries.
Their purpose is to indicate to an application that someone is `working'
on an item.  So it is possible to put an user lock on a tuple's oid,
retrieve the tuple, work on it for an hour and then update it and remove
the lock.  While the lock is active other clients can still read and write
the tuple but they can be aware that it has been locked at the application
level by someone.

User locks and normal locks are completely orthogonal and they don't
interfere with each other.

User locks are always held as session locks, so that they are not released at
transaction end.  They must be released explicitly by the application --- but
they are released automatically when a backend terminates.

Locking during Hot Standby
--------------------------

The Startup process is the only backend that can make changes during
recovery, all other backends are read only.  As a result the Startup
process does not acquire locks on relations or objects except when the lock
level is AccessExclusiveLock.

Regular backends are only allowed to take locks on relations or objects
at RowExclusiveLock or lower. This ensures that they do not conflict with
each other or with the Startup process, unless AccessExclusiveLocks are
requested by one of the backends.

Deadlocks involving AccessExclusiveLocks are not possible, so we need
not be concerned that a user initiated deadlock can prevent recovery from
progressing.

AccessExclusiveLocks on the primary or master node generate WAL records
that are then applied by the Startup process. Locks are released at end
of transaction just as they are in normal processing. These locks are
held by the Startup process, acting as a proxy for the backends that
originally acquired these locks. Again, these locks cannot conflict with
one another, so the Startup process cannot deadlock itself either.
@


1.24
log
@More README src cleanups.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.23 2008/03/20 17:55:15 momjian Exp $
d520 24
@


1.23
log
@Make source code READMEs more consistent.  Add CVS tags to all README files.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.22 2007/10/26 20:45:10 alvherre Exp $
d4 1
a4 1
----------------
@


1.22
log
@Allow an autovacuum worker to be interrupted automatically when it is found
to be locking another process (except when it's working to prevent Xid
wraparound problems).
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.21 2006/09/18 22:40:36 tgl Exp $
d3 2
a4 2

LOCKING OVERVIEW
d48 2
a49 1
LOCK DATA STRUCTURES
d191 2
a192 1
LOCK MANAGER INTERNAL LOCKING
d251 2
a252 1
THE DEADLOCK DETECTION ALGORITHM
d451 2
a452 1
Miscellaneous notes:
d502 2
a503 1
USER LOCKS
@


1.21
log
@Add built-in userlock manipulation functions to replace the former
contrib functionality.  Along the way, remove the USER_LOCKS configuration
symbol, since it no longer makes any sense to try to compile that out.
No user documentation yet ... mmoncure has promised to write some.
Thanks to Abhijit Menon-Sen for creating a first draft to work from.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.20 2006/07/23 23:08:46 tgl Exp $
d490 7
@


1.20
log
@Convert the lock manager to use the new dynahash.c support for partitioned
hash tables, instead of the previous kluge involving multiple hash tables.
This partially undoes my patch of last December.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.19 2005/12/11 21:02:18 tgl Exp $
d51 1
a51 1
two lock methods: DEFAULT and USER.  (USER locks are non-blocking.)
d505 3
a507 8
User locks are always non blocking, therefore they are never acquired if
already held by another process.  They must be released explicitly by the
application but they are released automatically when a backend terminates.

The lockmode parameter can have the same values as for normal locks although
probably only ExclusiveLock can have some practical use.

	DZ - 22 Nov 1997
@


1.19
log
@Divide the lock manager's shared state into 'partitions', so as to
reduce contention for the former single LockMgrLock.  Per my recent
proposal.  I set it up for 16 partitions, but on a pgbench test this
gives only a marginal further improvement over 4 partitions --- we need
to test more scenarios to choose the number of partitions.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.18 2005/12/09 01:22:04 tgl Exp $
d151 15
a165 7
    we always zero out the correct number of bytes.

    tag.lock
        SHMEM offset of the LOCK object this PROCLOCK is for.

    tag.proc
        SHMEM offset of PGPROC of backend process that owns this PROCLOCK.
d202 12
a213 6
its LOCKTAG value (see LockTagToPartition()).  The partition's LWLock is
considered to protect all the LOCK objects of that partition as well as
their subsidiary PROCLOCKs.  The shared-memory hash tables for LOCKs and
PROCLOCKs are divided into separate hash tables for each partition, and
operations on each hash table are likewise protected by the partition
lock.
d243 4
a246 3
the partition number in LOCALLOCK table entries, but this is a straight
speed-for-space tradeoff: we could instead recalculate the partition
number from the LOCKTAG when needed.
@


1.18
log
@Simplify lock manager data structures by making a clear separation between
the data defining the semantics of a lock method (ie, conflict resolution
table and ancillary data, which is all constant) and the hash tables
storing the current state.  The only thing we give up by this is the
ability to use separate hashtables for different lock methods, but there
is no need for that anyway.  Put some extra fields into the LockMethod
definition structs to clean up some other uglinesses, like hard-wired
tests for DEFAULT_LOCKMETHOD and USER_LOCKMETHOD.  This commit doesn't
do anything about the performance issues we were discussing, but it clears
away some of the underbrush that's in the way of fixing that.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.17 2005/06/14 22:15:32 tgl Exp $
d53 6
a58 3
Lock modes describe the type of the lock (read/write or shared/exclusive). 
See src/tools/backend/index.html and src/include/storage/lock.h for more
details.
d73 1
a73 1
LockMgrLock need not be obtained to alter them.
d109 2
a110 2
    This is a shared memory queue of all process structures corresponding to
    a backend that is waiting (sleeping) until another backend releases this
d112 1
a112 1
    if it should be woken up when this lock is released.
d137 1
a137 1
    requested, summing the values of granted should total to the value
d141 2
a142 2
0 <= granted[i] <= requested[i] for each i.  If the request counts go to
zero, the lock object is no longer needed and can be freed.
d160 1
a160 1
    A bitmask for the lock types successfully acquired by this PROCLOCK.
d162 2
a163 1
    subset of the PGPROC object's heldLocks mask.
d166 1
a166 1
    A bitmask for the lock types due to be released during LockReleaseAll.
d168 2
a169 2
    taking the LockMgrLock, and therefore it is unsafe for any backend except
    the one owning the PROCLOCK to examine/change it.
d181 54
a234 1
The deadlock detection algorithm:
@


1.17
log
@Simplify shared-memory lock data structures as per recent discussion:
it is sufficient to track whether a backend holds a lock or not, and
store information about transaction vs. session locks only in the
inside-the-backend LocalLockTable.  Since there can now be but one
PROCLOCK per lock per backend, LockCountMyLocks() is no longer needed,
thus eliminating some O(N^2) behavior when a backend holds many locks.
Also simplify the LockAcquire/LockRelease API by passing just a
'sessionLock' boolean instead of a transaction ID.  The previous API
was designed with the idea that per-transaction lock holding would be
important for subtransactions, but now that we have subtransactions we
know that this is unwanted.  While at it, add an 'isTempObject' parameter
to LockAcquire to indicate whether the lock is being taken on a temp
table.  This is not used just yet, but will be needed shortly for
two-phase commit.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.16 2005/04/29 22:28:24 tgl Exp $
d154 1
a154 1
        SHMEM offset of PROC of backend process that owns this PROCLOCK.
d418 23
@


1.16
log
@Restructure LOCKTAG as per discussions of a couple months ago.
Essentially, we shoehorn in a lockable-object-type field by taking
a byte away from the lockmethodid, which can surely fit in one byte
instead of two.  This allows less artificial definitions of all the
other fields of LOCKTAG; we can get rid of the special pg_xactlock
pseudo-relation, and also support locks on individual tuples and
general database objects (including shared objects).  None of those
possibilities are actually exploited just yet, however.

I removed pg_xactlock from pg_class, but did not force initdb for
that change.  At this point, relkind 's' (SPECIAL) is unused and
could be removed entirely.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/storage/lmgr/README,v 1.15 2004/08/27 17:07:41 tgl Exp $
d60 1
a60 1
locks held or requested on it.  A PROCLOCK struct exists for each transaction
d66 1
a66 1
be made per lockable object/lock mode/transaction.  Internally to a backend,
d68 3
a70 2
in a transaction.  The internal request counts are held in LOCALLOCK so that
the shared LockMgrLock need not be obtained to alter them.
d116 2
a117 3
    acquires a write, or acquires the lock under two different transaction
    IDs.  (But multiple acquisitions of the same lock/lock mode under the
    same transaction ID are not multiply counted here; they are recorded
a155 12
    tag.xid
        XID of transaction this PROCLOCK is for, or InvalidTransactionId
        if the PROCLOCK is for session-level locking.

    Note that this structure will support multiple transactions running
    concurrently in one backend.  Currently we do not use it for that
    purpose: subtransactions acquire locks in the name of their top parent
    transaction, to simplify reassigning lock ownership at subtransaction end.
    So the XID field is really only needed to distinguish per-transaction
    locks from session locks.  User locks are always session locks, and we
    also use session locks for multi-transaction operations like VACUUM.

d161 6
@


1.15
log
@Introduce local hash table for lock state, as per recent proposal.
PROCLOCK structs in shared memory now have only a bitmask for held
locks, rather than counts (making them 40 bytes smaller, which is a
good thing).  Multiple locks within a transaction are counted in the
local hash table instead, and we have provision for tracking which
ResourceOwner each count belongs to.  Solves recently reported problem
with memory leakage within long transactions.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql-server/src/backend/storage/lmgr/README,v 1.14 2003/11/29 19:51:56 pgsql Exp $
d77 8
a84 24
    lock hash table.  This is declared as a separate struct to ensure that
    we always zero out the correct number of bytes.  It is critical that
    any alignment-padding bytes the compiler might insert in the struct
    be zeroed out, else the hash computation will be random.

    tag.relId -
	Uniquely identifies the relation that the lock corresponds to.
    
    tag.dbId -
	Uniquely identifies the database in which the relation lives.  If
	this is a shared system relation (e.g. pg_database) the dbId must
	be set to 0.

    tag.objId -
	Uniquely identifies the block/page within the relation and the
	tuple within the block.  If we are setting a table level lock
	both the blockId and tupleId (in an item pointer this is called
	the position) are set to invalid, if it is a page level lock the
	blockId is valid, while the tupleId is still invalid.  Finally if
	this is a tuple level lock (we currently never do this) then both
	the blockId and tupleId are set to valid specifications.  This is
	how we get the appearance of a multi-level lock table while using
	only a single table (see Gray's paper on 2 phase locking if
	you are puzzled about how multi-level lock tables work).
@


1.14
log
@
$Header: -> $PostgreSQL Changes ...
@
text
@d1 1
a1 1
$PostgreSQL: /cvsroot/pgsql-server/src/backend/storage/lmgr/README,v 1.13 2003/02/18 03:33:50 momjian Exp $
a49 6
There are two fundamental lock structures: the per-lockable-object LOCK
struct, and the per-lock PROCLOCK struct.  A LOCK object exists
for each lockable object that currently has locks held or requested on it.
A PROCLOCK struct exists for each transaction that is holding or requesting
lock(s) on each LOCK object.

d57 14
d114 1
a114 1
lockHolders -
d131 4
a134 1
    acquires a write, or acquires a read lock twice.
d144 2
a145 3
    This count does not include attempts that are waiting due to conflicts,
    but can count the same backend twice (e.g. a read then a write -- since
    its the same transaction this won't cause a conflict).
d177 11
a187 12
    concurrently in one backend, which may be handy if we someday decide
    to support nested transactions.  Currently, the XID field is only needed
    to distinguish per-transaction locks from session locks.  User locks
    are always session locks, and we also use session locks for multi-
    transaction operations like VACUUM.

holding -
    The number of successfully acquired locks of each type for this PROCLOCK.
    This should be <= the corresponding granted[] value of the lock object!

nHolding -
    Sum of the holding[] array.
@


1.13
log
@Update README.
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql-server/src/backend/storage/lmgr/README,v 1.12 2002/10/31 21:34:16 tgl Exp $
@


1.12
log
@Code review for statement_timeout patch.  Fix some race conditions
between signal handler and enable/disable code, avoid accumulation of
timing error due to trying to maintain remaining-time instead of
absolute-end-time, disable timeout before commit not after.
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql-server/src/backend/storage/lmgr/README,v 1.11 2002/07/19 00:17:40 momjian Exp $
d51 1
a51 1
struct, and the per-lock-holder PROCLOCK struct.  A LOCK object exists
@


1.11
log
@Complete TODO item:

* -HOLDER/HOLDERTAB rename to PROCLOCK/PROCLOCKTAG
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.10 2002/04/15 23:46:13 momjian Exp $
d395 1
a395 1
certain to detect and resolve the cycle when it later runs HandleDeadLock.
d405 1
a405 1
HandleDeadLock, then the first one to do so will be the victim.
@


1.10
log
@The attached patch corrects an inaccuracy in src/backend/catalog/README
and fixes a few spelling mistakes in src/bakckend/lmgr/README.

Neil Conway
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.9 2001/09/29 04:02:24 tgl Exp $
d10 34
a43 32
kernel call (or even a call to a nontrivial subroutine), don't use a spinlock.
Spinlocks are primarily used as infrastructure for lightweight locks.
They are implemented using a hardware atomic-test-and-set instruction,
if available.  Waiting processes busy-loop until they can get the lock.
There is no provision for deadlock detection, automatic release on error,
or any other nicety.  There is a timeout if the lock cannot be gotten after
a minute or so (which is approximately forever in comparison to the intended
lock hold time, so this is certainly an error condition).

* Lightweight locks (LWLocks).  These locks are typically used to interlock
access to datastructures in shared memory.  LWLocks support both exclusive
and shared lock modes (for read/write and read-only access to a shared object).
There is no provision for deadlock detection, but the LWLock manager will
automatically release held LWLocks during elog() recovery, so it is safe to
raise an error while holding LWLocks.  Obtaining or releasing an LWLock is
quite fast (a few dozen instructions) when there is no contention for the
lock.  When a process has to wait for an LWLock, it blocks on a SysV semaphore
so as to not consume CPU time.  Waiting processes will be granted the lock
in arrival order.  There is no timeout.

* Regular locks (a/k/a heavyweight locks).  The regular lock manager supports
a variety of lock modes with table-driven semantics, and it has full deadlock
detection and automatic release at transaction end.  Regular locks should be
used for all user-driven lock requests.

Acquisition of either a spinlock or a lightweight lock causes query cancel
and die() interrupts to be held off until all such locks are released.
No such restriction exists for regular locks, however.  Also note that we
can accept query cancel and die() interrupts while waiting for a regular
lock, but we will not accept them while waiting for spinlocks or LW locks.
It is therefore not a good idea to use LW locks when the wait time might
exceed a few seconds.
d51 1
a51 1
struct, and the per-lock-holder HOLDER struct.  A LOCK object exists
d53 1
a53 1
A HOLDER struct exists for each transaction that is holding or requesting
d107 3
a109 3
    This is a shared memory queue of all the HOLDER structs associated with
    the lock object.  Note that both granted and waiting HOLDERs are in this
    list (indeed, the same HOLDER might have some already-granted locks and
d149 1
a149 1
The lock manager's HOLDER objects contain:
d153 1
a153 1
    holder hash table.  This is declared as a separate struct to ensure that
d157 1
a157 1
        SHMEM offset of the LOCK object this holder is for.
d160 1
a160 1
        SHMEM offset of PROC of backend process that owns this holder.
d163 2
a164 2
        XID of transaction this holder is for, or InvalidTransactionId
        if the holder is for session-level locking.
d174 1
a174 1
    The number of successfully acquired locks of each type for this holder.
d181 1
a181 1
    List link for shared memory queue of all the HOLDER objects for the
d185 1
a185 1
    List link for shared memory queue of all the HOLDER objects for the
d198 14
a211 14
(lock grant and release) run quickly when there is no deadlock, and avoid
the overhead of deadlock handling as much as possible.  We do this using
an "optimistic waiting" approach: if a process cannot acquire the lock
it wants immediately, it goes to sleep without any deadlock check.  But
it also sets a delay timer, with a delay of DeadlockTimeout milliseconds
(typically set to one second).  If the delay expires before the process is
granted the lock it wants, it runs the deadlock detection/breaking code.
Normally this code will determine that there is no deadlock condition,
and then the process will go back to sleep and wait quietly until it is
granted the lock.  But if a deadlock condition does exist, it will be
resolved, usually by aborting the detecting process' transaction.  In this
way, we avoid deadlock handling overhead whenever the wait time for a lock
is less than DeadlockTimeout, while not imposing an unreasonable delay of
detection when there is an error.
d215 2
a216 2
1. A lock request is granted immediately if it does not conflict with any
existing or waiting lock request, or if the process already holds an
d218 2
a219 2
lock twice).  Note that a process never conflicts with itself, eg one can
obtain read lock when one already holds exclusive lock.
d221 12
a232 11
2. Otherwise the process joins the lock's wait queue.  Normally it will be
added to the end of the queue, but there is an exception: if the process
already holds locks on this same lockable object that conflict with the
request of any pending waiter, then the process will be inserted in the
wait queue just ahead of the first such waiter.  (If we did not make this
check, the deadlock detection code would adjust the queue order to resolve
the conflict, but it's relatively cheap to make the check in ProcSleep and
avoid a deadlock timeout delay in this case.)  Note special case when
inserting before the end of the queue: if the process's request does not
conflict with any existing lock nor any waiting request before its insertion
point, then go ahead and grant the lock without waiting.
d238 2
a239 2
ensures that conflicting requests are granted in order of arrival.
There are cases where a later waiter must be allowed to go in front of
d248 3
a250 3
lock.  There is a deadlock condition if and only if the WFG contains
a cycle.  We detect cycles by searching outward along waits-for edges
to see if we return to our starting point.  There are three possible
d256 6
a261 6
2. A deadlock is detected by looping back to the start point.  We resolve
such a deadlock by canceling the start point's lock request and reporting
an error in that transaction, which normally leads to transaction abort
and release of that transaction's held locks.  Note that it's sufficient
to cancel one request to remove the cycle; we don't need to kill all the
transactions involved.
d264 5
a268 5
indicates a deadlock, but one that does not involve our starting process.
We ignore this condition on the grounds that resolving such a deadlock
is the responsibility of the processes involved --- killing our start-
point process would not resolve the deadlock.  So, cases 1 and 3 both
report "no deadlock".
d274 4
a277 3
might be multiple holders of (non-conflicting) lock types that all conflict
with the waiter's request.  This creates no real difficulty however; we
simply need to be prepared to trace more than one outgoing edge.
d416 7
a422 6
4. We can minimize excess rearrangement-trial work by being careful to scan
the wait queue from the front when looking for soft edges.  For example,
if we have queue order A,B,C and C has deadlock conflicts with both A and B,
we want to generate the "C before A" constraint first, rather than wasting
time with "C before B", which won't move C far enough up.  So we look for
soft edges outgoing from C starting at the front of the wait queue.
d426 5
a430 4
we can allocate the worst-case space needed during backend startup.
This seems a safer approach than trying to allocate workspace on the fly;
we don't want to risk having the deadlock detector run out of memory,
else we really have no guarantees at all that deadlock will be detected.
@


1.9
log
@Implement new 'lightweight lock manager' that's intermediate between
existing lock manager and spinlocks: it understands exclusive vs shared
lock but has few other fancy features.  Replace most uses of spinlocks
with lightweight locks.  All remaining uses of spinlocks have very short
lock hold times (a few dozen instructions), so tweak spinlock backoff
code to work efficiently given this assumption.  All per my proposal on
pghackers 26-Sep-01.
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.8 2001/01/26 18:23:12 tgl Exp $
d96 1
a96 1
    lock types held.  Conficts are determined by bitwise AND operations
d271 1
a271 1
might be multiple holders of (nonconflicting) lock types that all conflict
d295 1
a295 1
It recursively scans outwards across waits-for edges as discussed above.
d318 1
a318 1
ordering as much as possible, so as not to gratuituously break arrival
d409 1
a409 1
ProcLockWakeup, in case the cancelled waiter was soft-blocking other
@


1.8
log
@Special case in ProcSleep() wasn't sufficiently general: must check to
see if we shouldn't block whenever we insert ourselves anywhere before
the end of the queue, not only at the front.
@
text
@d1 46
a46 1
$Header: /home/projects/pgsql/cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.7 2001/01/25 03:31:16 tgl Exp $
@


1.7
log
@Re-implement deadlock detection and resolution, per design notes posted
to pghackers on 18-Jan-01.
@
text
@d1 1
a1 1
$Header: /home/projects/pgsql/cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.6 2001/01/22 22:30:06 tgl Exp $
d181 4
a184 5
avoid a deadlock timeout delay in this case.)  Note special case: if the
process holds locks that conflict with the first waiter, so that it would
go at the front of the queue, and its request does not conflict with the
already-granted locks, then the process will be granted the lock without
going to sleep at all.
d194 1
a194 1
deadlock detection code re-orders the wait queue when necessary.
d265 2
a266 2
considering rearranging.  This table is passed to FindLockCycle, and it
believes the given queue order rather than the "real" order for each lock
d271 1
a271 1
reversing is a property of the partial order that the topological sort
@


1.6
log
@Clean up lockmanager data structures some more, in preparation for planned
rewrite of deadlock checking.  Lock holder objects are now reachable from
the associated LOCK as well as from the owning PROC.  This makes it
practical to find all the processes holding a lock, as well as all those
waiting on the lock.  Also, clean up some of the grottier aspects of the
SHMQueue API, and cause the waitProcs list to be stored in the intuitive
direction instead of the nonintuitive one.  (Bet you didn't know that
the code followed the 'prev' link to get to the next waiting process,
instead of the 'next' link.  It doesn't do that anymore.)
@
text
@d1 1
a1 1
$Header: /home/projects/pgsql/cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.5 2001/01/16 06:11:34 tgl Exp $
d376 5
a380 4
be proven not to need more than MAXBACKENDS entries.  Therefore the
working storage can be statically allocated instead of depending on
palloc().  This is a good thing, since if the deadlock detector could
fail for extraneous reasons, all the above safety proofs fall down.
@


1.5
log
@Rename fields of lock and lockholder structures to something a tad less
confusing, and clean up documentation.
@
text
@d1 1
a1 1
$Header: /home/projects/pgsql/cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.4 2000/12/22 00:51:54 tgl Exp $
d18 1
a18 1
The lock manager's LOCK:
d32 2
a33 2
	this is a shared system relation (e.g. pg_user) the dbId should be
	set to 0.
d35 1
a35 1
    tag.tupleId -
d59 6
d102 1
a102 1
The lock manager's HOLDER:
d112 2
a113 2
    tag.pid
        PID of backend process that owns this holder.
d133 5
a137 1
queue -
d140 240
@


1.4
log
@Revise lock manager to support "session level" locks as well as "transaction
level" locks.  A session lock is not released at transaction commit (but it
is released on transaction abort, to ensure recovery after an elog(ERROR)).
In VACUUM, use a session lock to protect the master table while vacuuming a
TOAST table, so that the TOAST table can be done in an independent
transaction.

I also took this opportunity to do some cleanup and renaming in the lock
code.  The previously noted bug in ProcLockWakeup, that it couldn't wake up
any waiters beyond the first non-wakeable waiter, is now fixed.  Also found
a previously unknown bug of the same kind (failure to scan all members of
a lock queue in some cases) in DeadLockCheck.  This might have led to failure
to detect a deadlock condition, resulting in indefinite waits, but it's
difficult to characterize the conditions required to trigger a failure.
@
text
@d1 1
a1 1
$Header: /home/projects/pgsql/cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.3 1998/07/06 18:16:07 momjian Exp $
d47 2
a48 2
mask -
    This field indicates what types of locks are currently held on the
d50 1
a50 1
    table) to determine if the new lock request will conflict with existing
d52 6
a57 4
    between the mask and the conflict table entry for the given lock type
    to be set.  The current representation is that each bit (1 through 5)
    is set when that lock type (WRITE, READ, WRITE INTENT, READ INTENT, EXTEND)
    has been acquired for the lock.
d63 1
a63 6
    if it should be woken up when this lock is released.  If, for example,
    we are releasing a read lock and the process is sleeping trying to acquire
    a read lock then there is no point in waking it since the lock being
    released isn't what caused it to sleep in the first place.  There will
    be more on this below (when I get to releasing locks and waking sleeping
    process routines).
d65 1
a65 1
nHolding -
d70 1
a70 1
    acquires a write.
d72 1
a72 1
holders -
d74 3
a76 3
    elements 1 through MAX_LOCK_TYPES are used as they correspond to the lock
    type defined constants (WRITE through EXTEND).  Summing the values of
    holders should come out equal to nHolding.
d78 2
a79 2
nActive -
    Keeps a count of how many times this lock has been succesfully acquired.
d82 1
a82 1
    its the same transaction this won't cause a conflict)
d84 9
a92 5
activeHolders -
    Keeps a count of how locks of each type are currently held.  Once again
    only elements 1 through MAX_LOCK_TYPES are used (0 is not).  Also, like
    holders, summing the values of activeHolders should total to the value
    of nActive.
d120 1
a120 1
holders -
d122 1
a122 3
    (CAUTION: the semantics are not the same as the LOCK's holder[], which
    counts both acquired and pending requests.  Probably a different name
    should be used...)
d125 1
a125 1
    Sum of the holders[] array.
@


1.3
log
@Update lock manager README.
@
text
@d1 1
a1 1
$Header: /usr/local/cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.2 1998/01/28 02:29:26 momjian Exp $
d3 10
a12 3
There are two fundemental lock structures.  Lock methods describe the
locking behavior.  We currently only support multi-level locking.  Lock
modes describe the mode of the lock(read/write or shared/exclusive). 
d22 4
a25 4
    lock hash table.  This is kept as a separate struct to ensure that we
    always zero out the correct number of bytes.  This is a problem as
    part of the tag is an itempointer which is 6 bytes and causes 2
    additional bytes to be added as padding.
d40 1
a40 1
	blockId is valid, while the tuleId is still invalid.  Finally if
d48 3
a50 3
    This field indicates what types of locks are currently held in the
    given lock.  It is used (against the lock table's conflict table)
    to determine if the new lock request will conflict with existing
d83 1
a83 1
    This count does not include attempts that were rejected due to conflicts,
d95 36
@


1.2
log
@More deadlock code to check for escallation locks.

offsetof() addition to local socket size.
@
text
@d1 1
a1 1
$Header: /usr/local/cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
d3 5
a7 7
This file is an attempt to save me (and future code maintainers) some
time and a lot of headaches.  The existing lock manager code at the time
of this writing (June 16 1992) can best be described as confusing.  The
complexity seems inherent in lock manager functionality, but variable
names chosen in the current implementation really confuse me everytime
I have to track down a bug.  Also, what gets done where and by whom isn't
always clear....
d9 1
a9 4
Starting with the data structures the lock manager relies upon...

(NOTE - these will undoubtedly change over time and it is likely
that this file won't always be updated along with the structs.)
a87 7
Locks are accessed in two ways.  Each PROC structure has a lockQueue,
that is a circular linked list of LOCK pointers that this process holds
or is waiting on.

Second, there is a hash table that can do a lookup by combined LOCK
address and transaction id(xid) which allows a process to see what
type of locks it holds on that table.
@


1.1
log
@Initial revision
@
text
@d1 1
a1 1
$Header: /usr/local/cvsroot/postgres95/postgres95/src/backend/storage/lmgr/README,v 1.1.1.1 1996/07/09 05:32:16 scrappy Exp $
d91 1
d93 7
a99 2
This is all I had the stomach for right now..... I will get back to this
someday.	-mer 17 June 1992 12:00 am
@


1.1.1.1
log
@Postgres95 1.01 Distribution - Virgin Sources
@
text
@@
