head	1.15;
access;
symbols
	REL9_0_0:1.15
	REL9_1_ALPHA1:1.15
	REL9_0_RC1:1.15
	REL9_0_BETA4:1.15
	REL9_0_STABLE:1.15.0.14
	REL9_0_BETA3:1.15
	REL9_0_BETA2:1.15
	REL7_4_29:1.4
	REL8_0_25:1.8
	REL8_1_21:1.8
	REL8_2_17:1.9
	REL8_3_11:1.11
	REL8_4_4:1.15
	REL9_0_BETA1:1.15
	REL9_0_ALPHA5_BRANCH:1.15.0.12
	REL9_0_ALPHA5:1.15
	REL7_4_28:1.4
	REL8_0_24:1.8
	REL8_1_20:1.8
	REL8_2_16:1.9
	REL8_3_10:1.11
	REL8_4_3:1.15
	REL9_0_ALPHA4:1.15
	REL9_0_ALPHA4_BRANCH:1.15.0.10
	REL8_5_ALPHA3:1.15
	REL8_5_ALPHA3_BRANCH:1.15.0.8
	REL7_4_27:1.4
	REL8_0_23:1.8
	REL8_1_19:1.8
	REL8_2_15:1.9
	REL8_3_9:1.11
	REL8_4_2:1.15
	REL8_5_ALPHA2:1.15
	REL8_5_ALPHA2_BRANCH:1.15.0.6
	REL7_4_26:1.4
	REL8_0_22:1.8
	REL8_1_18:1.8
	REL8_2_14:1.9
	REL8_3_8:1.11
	REL8_4_1:1.15
	REL8_5_ALPHA1:1.15
	REL8_5_ALPHA1_BRANCH:1.15.0.4
	REL8_4_STABLE:1.15.0.2
	REL8_4_0:1.15
	REL8_4_RC2:1.15
	REL8_4_RC1:1.15
	REL8_4_BETA2:1.15
	REL8_4_BETA1:1.15
	REL7_4_25:1.4
	REL8_0_21:1.8
	REL8_1_17:1.8
	REL8_2_13:1.9
	REL8_3_7:1.11
	REL7_4_24:1.4
	REL8_0_20:1.8
	REL8_1_16:1.8
	REL8_2_12:1.9
	REL8_3_6:1.11
	REL7_4_23:1.4
	REL8_0_19:1.8
	REL8_1_15:1.8
	REL8_2_11:1.9
	REL8_3_5:1.11
	REL7_4_22:1.4
	REL8_0_18:1.8
	REL8_1_14:1.8
	REL8_2_10:1.9
	REL8_3_4:1.11
	REL7_4_21:1.4
	REL8_0_17:1.8
	REL8_1_13:1.8
	REL8_2_9:1.9
	REL8_3_3:1.11
	REL7_4_20:1.4
	REL8_0_16:1.8
	REL8_1_12:1.8
	REL8_2_8:1.9
	REL8_3_2:1.11
	REL8_2_7:1.9
	REL8_3_1:1.11
	REL8_3_STABLE:1.11.0.2
	REL8_3_0:1.11
	REL8_3_RC2:1.11
	REL7_3_21:1.3
	REL7_4_19:1.4
	REL8_0_15:1.8
	REL8_1_11:1.8
	REL8_2_6:1.9
	REL8_3_RC1:1.11
	REL8_3_BETA4:1.11
	REL8_3_BETA3:1.11
	REL8_3_BETA2:1.11
	REL8_3_BETA1:1.11
	REL7_3_20:1.3
	REL7_4_18:1.4
	REL8_0_14:1.8
	REL8_1_10:1.8
	REL8_2_5:1.9
	REL7_3_19:1.3
	REL7_4_17:1.4
	REL8_0_13:1.8
	REL8_1_9:1.8
	REL8_2_4:1.9
	REL8_0_12:1.8
	REL8_1_8:1.8
	REL8_2_3:1.9
	REL7_3_18:1.3
	REL7_4_16:1.4
	REL8_0_11:1.8
	REL8_1_7:1.8
	REL8_2_2:1.9
	REL8_0_10:1.8
	REL8_1_6:1.8
	REL8_2_1:1.9
	REL7_4_15:1.4
	REL7_3_17:1.3
	REL8_2_STABLE:1.9.0.2
	REL8_2_0:1.9
	REL8_2_RC1:1.9
	REL8_2_BETA3:1.9
	REL8_2_BETA2:1.9
	REL8_1_5:1.8
	REL8_0_9:1.8
	REL7_4_14:1.4
	REL7_3_16:1.3
	REL8_2_BETA1:1.9
	REL7_3_15:1.3
	REL7_4_13:1.4
	REL8_0_8:1.8
	REL8_1_4:1.8
	REL7_3_14:1.3
	REL7_4_12:1.4
	REL8_0_7:1.8
	REL8_1_3:1.8
	REL7_3_13:1.3
	REL7_4_11:1.4
	REL8_0_6:1.8
	REL8_1_2:1.8
	REL7_3_12:1.3
	REL7_4_10:1.4
	REL8_0_5:1.8
	REL8_1_1:1.8
	REL8_1_STABLE:1.8.0.6
	REL8_1_0:1.8
	REL8_1_0RC1:1.8
	REL8_1_0BETA4:1.8
	REL8_1_0BETA3:1.8
	REL7_3_11:1.3
	REL7_4_9:1.4
	REL8_0_4:1.8
	REL8_1_0BETA2:1.8
	REL8_1_0BETA1:1.8
	REL7_2_8:1.3
	REL7_3_10:1.3
	REL7_4_8:1.4
	REL8_0_3:1.8
	REL8_0_2:1.8
	REL7_2_7:1.3
	REL7_3_9:1.3
	REL7_4_7:1.4
	REL8_0_1:1.8
	REL8_0_STABLE:1.8.0.4
	REL8_0_0:1.8.0.2
	REL8_0_0RC5:1.8
	REL8_0_0RC4:1.8
	REL8_0_0RC3:1.8
	REL8_0_0RC2:1.8
	REL8_0_0RC1:1.8
	REL8_0_0BETA5:1.8
	REL8_0_0BETA4:1.8
	REL7_4_6:1.4
	REL7_3_8:1.3
	REL7_2_6:1.3
	REL8_0_0BETA3:1.8
	REL8_0_0BETA2:1.8
	REL7_2_5:1.3
	REL7_4_5:1.4
	REL7_3_7:1.3
	REL7_4_4:1.4
	REL8_0_0BETA1:1.8
	REL7_4_3:1.4
	REL7_4_2:1.4
	REL7_3_6:1.3
	REL7_4_1:1.4
	REL7_3_5:1.3
	REL7_4:1.4
	REL7_4_RC2:1.4
	REL7_4_STABLE:1.4.0.4
	REL7_4_RC1:1.4
	REL7_4_BETA5:1.4
	REL7_4_BETA4:1.4
	REL7_4_BETA3:1.4
	REL7_4_BETA2:1.4
	WIN32_DEV:1.4.0.2
	REL7_4_BETA1:1.4
	REL7_3_4:1.3
	REL7_3_2:1.3
	REL7_2_4:1.3
	REL7_3_STABLE:1.3.0.6
	REL7_2_3:1.3
	REL7_2_STABLE:1.3.0.4
	REL7_2:1.3
	REL7_2_RC2:1.3
	REL7_2_RC1:1.3
	REL7_2_BETA5:1.3
	REL7_2_BETA4:1.3
	REL7_2_BETA3:1.3
	REL7_2_BETA2:1.3
	REL7_2_BETA1:1.3
	REL7_1_2:1.3
	REL7_1_STABLE:1.3.0.2
	REL7_1_BETA:1.2
	REL7_1_BETA3:1.2
	REL7_1_BETA2:1.2
	REL7_1:1.3;
locks; strict;
comment	@# @;


1.15
date	2008.04.09.01.00.46;	author momjian;	state Exp;
branches;
next	1.14;

1.14
date	2008.04.09.00.59.24;	author momjian;	state Exp;
branches;
next	1.13;

1.13
date	2008.04.09.00.55.30;	author momjian;	state Exp;
branches;
next	1.12;

1.12
date	2008.03.20.17.55.15;	author momjian;	state Exp;
branches;
next	1.11;

1.11
date	2007.05.29.04.19.35;	author neilc;	state Exp;
branches;
next	1.10;

1.10
date	2007.03.13.00.33.42;	author tgl;	state Exp;
branches;
next	1.9;

1.9
date	2006.09.07.22.52.01;	author tgl;	state Exp;
branches;
next	1.8;

1.8
date	2004.08.04.21.34.04;	author tgl;	state Exp;
branches;
next	1.7;

1.7
date	2004.07.01.00.51.29;	author tgl;	state Exp;
branches;
next	1.6;

1.6
date	2004.06.05.19.48.09;	author tgl;	state Exp;
branches;
next	1.5;

1.5
date	2003.11.29.19.52.04;	author pgsql;	state Exp;
branches;
next	1.4;

1.4
date	2003.04.30.19.04.12;	author tgl;	state Exp;
branches;
next	1.3;

1.3
date	2001.02.15.21.38.26;	author tgl;	state Exp;
branches;
next	1.2;

1.2
date	2000.07.15.00.52.22;	author tgl;	state Exp;
branches;
next	1.1;

1.1
date	2000.06.28.03.32.50;	author tgl;	state Exp;
branches;
next	;


desc
@@


1.15
log
@Small wording improvements for source code READMEs.
@
text
@$PostgreSQL: pgsql/src/backend/utils/mmgr/README,v 1.14 2008/04/09 00:59:24 momjian Exp $

Notes About Memory Allocation Redesign
======================================

Up through version 7.0, Postgres had serious problems with memory leakage
during large queries that process a lot of pass-by-reference data.  There
was no provision for recycling memory until end of query.  This needed to be
fixed, even more so with the advent of TOAST which allows very large
chunks of data to be passed around in the system.  This document describes
the new memory management system implemented in 7.1.


Background
----------

We already do most of our memory allocation in "memory contexts", which
are usually AllocSets as implemented by backend/utils/mmgr/aset.c.  What
we need to do is create more contexts and define proper rules about when
they can be freed.

The basic operations on a memory context are:

* create a context

* allocate a chunk of memory within a context (equivalent of standard
  C library's malloc())

* delete a context (including freeing all the memory allocated therein)

* reset a context (free all memory allocated in the context, but not the
  context object itself)

Given a chunk of memory previously allocated from a context, one can
free it or reallocate it larger or smaller (corresponding to standard
library's free() and realloc() routines).  These operations return memory
to or get more memory from the same context the chunk was originally
allocated in.

At all times there is a "current" context denoted by the
CurrentMemoryContext global variable.  The backend macro palloc()
implicitly allocates space in that context.  The MemoryContextSwitchTo()
operation selects a new current context (and returns the previous context,
so that the caller can restore the previous context before exiting).

The main advantage of memory contexts over plain use of malloc/free is
that the entire contents of a memory context can be freed easily, without
having to request freeing of each individual chunk within it.  This is
both faster and more reliable than per-chunk bookkeeping.  We already use
this fact to clean up at transaction end: by resetting all the active
contexts, we reclaim all memory.  What we need are additional contexts
that can be reset or deleted at strategic times within a query, such as
after each tuple.


Some Notes About the palloc API Versus Standard C Library
---------------------------------------------------------

The behavior of palloc and friends is similar to the standard C library's
malloc and friends, but there are some deliberate differences too.  Here
are some notes to clarify the behavior.

* If out of memory, palloc and repalloc exit via elog(ERROR).  They never
return NULL, and it is not necessary or useful to test for such a result.

* palloc(0) is explicitly a valid operation.  It does not return a NULL
pointer, but a valid chunk of which no bytes may be used.  (However, the
chunk might later be repalloc'd larger; it can also be pfree'd without
error.)  (Note: this behavior is new in Postgres 8.0; earlier versions
disallowed palloc(0).  It seems more consistent to allow it, however.)
Similarly, repalloc allows realloc'ing to zero size.

* pfree and repalloc do not accept a NULL pointer.  This is intentional.


pfree/repalloc No Longer Depend On CurrentMemoryContext
-------------------------------------------------------

In this proposal, pfree() and repalloc() can be applied to any chunk
whether it belongs to CurrentMemoryContext or not --- the chunk's owning
context will be invoked to handle the operation, regardless.  This is a
change from the old requirement that CurrentMemoryContext must be set
to the same context the memory was allocated from before one can use
pfree() or repalloc().  The old coding requirement is obviously fairly
error-prone, and will become more so the more context-switching we do;
so I think it's essential to use CurrentMemoryContext only for palloc.
We can avoid needing it for pfree/repalloc by putting restrictions on
context managers as discussed below.

We could even consider getting rid of CurrentMemoryContext entirely,
instead requiring the target memory context for allocation to be specified
explicitly.  But I think that would be too much notational overhead ---
we'd have to pass an appropriate memory context to called routines in
many places.  For example, the copyObject routines would need to be passed
a context, as would function execution routines that return a
pass-by-reference datatype.  And what of routines that temporarily
allocate space internally, but don't return it to their caller?  We
certainly don't want to clutter every call in the system with "here is
a context to use for any temporary memory allocation you might want to
do".  So there'd still need to be a global variable specifying a suitable
temporary-allocation context.  That might as well be CurrentMemoryContext.


Additions to the Memory-Context Mechanism
-----------------------------------------

If we are going to have more contexts, we need more mechanism for keeping
track of them; else we risk leaking whole contexts under error conditions.

We can do this by creating trees of "parent" and "child" contexts.  When
creating a memory context, the new context can be specified to be a child
of some existing context.  A context can have many children, but only one
parent.  In this way the contexts form a forest (not necessarily a single
tree, since there could be more than one top-level context).

We then say that resetting or deleting any particular context resets or
deletes all its direct and indirect children as well.  This feature allows
us to manage a lot of contexts without fear that some will be leaked; we
only need to keep track of one top-level context that we are going to
delete at transaction end, and make sure that any shorter-lived contexts
we create are descendants of that context.  Since the tree can have
multiple levels, we can deal easily with nested lifetimes of storage,
such as per-transaction, per-statement, per-scan, per-tuple.  Storage
lifetimes that only partially overlap can be handled by allocating
from different trees of the context forest (there are some examples
in the next section).

For convenience we will also want operations like "reset/delete all
children of a given context, but don't reset or delete that context
itself".


Globally Known Contexts
-----------------------

There will be several widely-known contexts that will typically be
referenced through global variables.  At any instant the system may
contain many additional contexts, but all other contexts should be direct
or indirect children of one of these contexts to ensure they are not
leaked in the event of an error.

TopMemoryContext --- this is the actual top level of the context tree;
every other context is a direct or indirect child of this one.  Allocating
here is essentially the same as "malloc", because this context will never
be reset or deleted.  This is for stuff that should live forever, or for
stuff that the controlling module will take care of deleting at the
appropriate time.  An example is fd.c's tables of open files, as well as
the context management nodes for memory contexts themselves.  Avoid
allocating stuff here unless really necessary, and especially avoid
running with CurrentMemoryContext pointing here.

PostmasterContext --- this is the postmaster's normal working context.
After a backend is spawned, it can delete PostmasterContext to free its
copy of memory the postmaster was using that it doesn't need.  (Anything
that has to be passed from postmaster to backends will be passed in
TopMemoryContext.  The postmaster will have only TopMemoryContext,
PostmasterContext, and ErrorContext --- the remaining top-level contexts
will be set up in each backend during startup.)

CacheMemoryContext --- permanent storage for relcache, catcache, and
related modules.  This will never be reset or deleted, either, so it's
not truly necessary to distinguish it from TopMemoryContext.  But it
seems worthwhile to maintain the distinction for debugging purposes.
(Note: CacheMemoryContext will have child-contexts with shorter lifespans.
For example, a child context is the best place to keep the subsidiary
storage associated with a relcache entry; that way we can free rule
parsetrees and so forth easily, without having to depend on constructing
a reliable version of freeObject().)

MessageContext --- this context holds the current command message from the
frontend, as well as any derived storage that need only live as long as
the current message (for example, in simple-Query mode the parse and plan
trees can live here).  This context will be reset, and any children
deleted, at the top of each cycle of the outer loop of PostgresMain.  This
is kept separate from per-transaction and per-portal contexts because a
query string might need to live either a longer or shorter time than any
single transaction or portal.

TopTransactionContext --- this holds everything that lives until end of the
top-level transaction.  This context will be reset, and all its children
deleted, at conclusion of each top-level transaction cycle.  In most cases
you don't want to allocate stuff directly here, but in CurTransactionContext;
what does belong here is control information that exists explicitly to manage
status across multiple subtransactions.  Note: this context is NOT cleared
immediately upon error; its contents will survive until the transaction block
is exited by COMMIT/ROLLBACK.

CurTransactionContext --- this holds data that has to survive until the end
of the current transaction, and in particular will be needed at top-level
transaction commit.  When we are in a top-level transaction this is the same
as TopTransactionContext, but in subtransactions it points to a child context.
It is important to understand that if a subtransaction aborts, its
CurTransactionContext is thrown away after finishing the abort processing;
but a committed subtransaction's CurTransactionContext is kept until top-level
commit (unless of course one of the intermediate levels of subtransaction
aborts).  This ensures that we do not keep data from a failed subtransaction
longer than necessary.  Because of this behavior, you must be careful to clean
up properly during subtransaction abort --- the subtransaction's state must be
delinked from any pointers or lists kept in upper transactions, or you will
have dangling pointers leading to a crash at top-level commit.  An example of
data kept here is pending NOTIFY messages, which are sent at top-level commit,
but only if the generating subtransaction did not abort.

PortalContext --- this is not actually a separate context, but a
global variable pointing to the per-portal context of the currently active
execution portal.  This can be used if it's necessary to allocate storage
that will live just as long as the execution of the current portal requires.

ErrorContext --- this permanent context will be switched into for error
recovery processing, and then reset on completion of recovery.  We'll
arrange to have, say, 8K of memory available in it at all times.  In this
way, we can ensure that some memory is available for error recovery even
if the backend has run out of memory otherwise.  This allows out-of-memory
to be treated as a normal ERROR condition, not a FATAL error.


Contexts For Prepared Statements And Portals
--------------------------------------------

A prepared-statement object has an associated private context, in which
the parse and plan trees for its query are stored.  Because these trees
are read-only to the executor, the prepared statement can be re-used many
times without further copying of these trees.

An execution-portal object has a private context that is referenced by
PortalContext when the portal is active.  In the case of a portal created
by DECLARE CURSOR, this private context contains the query parse and plan
trees (there being no other object that can hold them).  Portals created
from prepared statements simply reference the prepared statements' trees,
and won't actually need any storage allocated in their private contexts.


Transient Contexts During Execution
-----------------------------------

When creating a prepared statement, the parse and plan trees will be built
in a temporary context that's a child of MessageContext (so that it will
go away automatically upon error).  On success, the finished plan is
copied to the prepared statement's private context, and the temp context
is released; this allows planner temporary space to be recovered before
execution begins.  (In simple-Query mode we'll not bother with the extra
copy step, so the planner temp space stays around till end of query.)

The top-level executor routines, as well as most of the "plan node"
execution code, will normally run in a context that is created by
ExecutorStart and destroyed by ExecutorEnd; this context also holds the
"plan state" tree built during ExecutorStart.  Most of the memory
allocated in these routines is intended to live until end of query,
so this is appropriate for those purposes.  The executor's top context
is a child of PortalContext, that is, the per-portal context of the
portal that represents the query's execution.

The main improvement needed in the executor is that expression evaluation
--- both for qual testing and for computation of targetlist entries ---
needs to not leak memory.  To do this, each ExprContext (expression-eval
context) created in the executor will now have a private memory context
associated with it, and we'll arrange to switch into that context when
evaluating expressions in that ExprContext.  The plan node that owns the
ExprContext is responsible for resetting the private context to empty
when it no longer needs the results of expression evaluations.  Typically
the reset is done at the start of each tuple-fetch cycle in the plan node.

Note that this design gives each plan node its own expression-eval memory
context.  This appears necessary to handle nested joins properly, since
an outer plan node might need to retain expression results it has computed
while obtaining the next tuple from an inner node --- but the inner node
might execute many tuple cycles and many expressions before returning a
tuple.  The inner node must be able to reset its own expression context
more often than once per outer tuple cycle.  Fortunately, memory contexts
are cheap enough that giving one to each plan node doesn't seem like a
problem.

A problem with running index accesses and sorts in a query-lifespan context
is that these operations invoke datatype-specific comparison functions,
and if the comparators leak any memory then that memory won't be recovered
till end of query.  The comparator functions all return bool or int32,
so there's no problem with their result data, but there can be a problem
with leakage of internal temporary data.  In particular, comparator
functions that operate on TOAST-able data types will need to be careful
not to leak detoasted versions of their inputs.  This is annoying, but
it appears a lot easier to make the comparators conform than to fix the
index and sort routines, so that's what I propose to do for 7.1.  Further
cleanup can be left for another day.

There will be some special cases, such as aggregate functions.  nodeAgg.c
needs to remember the results of evaluation of aggregate transition
functions from one tuple cycle to the next, so it can't just discard
all per-tuple state in each cycle.  The easiest way to handle this seems
to be to have two per-tuple contexts in an aggregate node, and to
ping-pong between them, so that at each tuple one is the active allocation
context and the other holds any results allocated by the prior cycle's
transition function.

Executor routines that switch the active CurrentMemoryContext may need
to copy data into their caller's current memory context before returning.
I think there will be relatively little need for that, because of the
convention of resetting the per-tuple context at the *start* of an
execution cycle rather than at its end.  With that rule, an execution
node can return a tuple that is palloc'd in its per-tuple context, and
the tuple will remain good until the node is called for another tuple
or told to end execution.  This is pretty much the same state of affairs
that exists now, since a scan node can return a direct pointer to a tuple
in a disk buffer that is only guaranteed to remain good that long.

A more common reason for copying data will be to transfer a result from
per-tuple context to per-run context; for example, a Unique node will
save the last distinct tuple value in its per-run context, requiring a
copy step.

Another interesting special case is VACUUM, which needs to allocate
working space that will survive its forced transaction commits, yet
be released on error.  Currently it does that through a "portal",
which is essentially a child context of TopMemoryContext.  While that
way still works, it's ugly since xact abort needs special processing
to delete the portal.  Better would be to use a context that's a child
of PortalContext and hence is certain to go away as part of normal
processing.  (Eventually we might have an even better solution from
nested transactions, but this'll do fine for now.)


Mechanisms to Allow Multiple Types of Contexts
----------------------------------------------

We may want several different types of memory contexts with different
allocation policies but similar external behavior.  To handle this,
memory allocation functions will be accessed via function pointers,
and we will require all context types to obey the conventions given here.
(This is not very far different from the existing code.)

A memory context will be represented by an object like

typedef struct MemoryContextData
{
    NodeTag        type;           /* identifies exact kind of context */
    MemoryContextMethods methods;
    MemoryContextData *parent;     /* NULL if no parent (toplevel context) */
    MemoryContextData *firstchild; /* head of linked list of children */
    MemoryContextData *nextchild;  /* next child of same parent */
    char          *name;           /* context name (just for debugging) */
} MemoryContextData, *MemoryContext;

This is essentially an abstract superclass, and the "methods" pointer is
its virtual function table.  Specific memory context types will use
derived structs having these fields as their first fields.  All the
contexts of a specific type will have methods pointers that point to the
same static table of function pointers, which will look like

typedef struct MemoryContextMethodsData
{
    Pointer     (*alloc) (MemoryContext c, Size size);
    void        (*free_p) (Pointer chunk);
    Pointer     (*realloc) (Pointer chunk, Size newsize);
    void        (*reset) (MemoryContext c);
    void        (*delete) (MemoryContext c);
} MemoryContextMethodsData, *MemoryContextMethods;

Alloc, reset, and delete requests will take a MemoryContext pointer
as parameter, so they'll have no trouble finding the method pointer
to call.  Free and realloc are trickier.  To make those work, we will
require all memory context types to produce allocated chunks that
are immediately preceded by a standard chunk header, which has the
layout

typedef struct StandardChunkHeader
{
    MemoryContext mycontext;         /* Link to owning context object */
    Size          size;              /* Allocated size of chunk */
} StandardChunkHeader;

It turns out that the existing aset.c memory context type does this
already, and probably any other kind of context would need to have the
same data available to support realloc, so this is not really creating
any additional overhead.  (Note that if a context type needs more per-
allocated-chunk information than this, it can make an additional
nonstandard header that precedes the standard header.  So we're not
constraining context-type designers very much.)

Given this, the pfree routine will look something like

    StandardChunkHeader * header = 
        (StandardChunkHeader *) ((char *) p - sizeof(StandardChunkHeader));

    (*header->mycontext->methods->free_p) (p);

We could do it as a macro, but the macro would have to evaluate its
argument twice, which seems like a bad idea (the current pfree macro
does not do that).  This is already saving two levels of function call
compared to the existing code, so I think we're doing fine without
squeezing out that last little bit ...


More Control Over aset.c Behavior
---------------------------------

Currently, aset.c allocates an 8K block upon the first allocation in
a context, and doubles that size for each successive block request.
That's good behavior for a context that might hold *lots* of data, and
the overhead wasn't bad when we had only a few contexts in existence.
With dozens if not hundreds of smaller contexts in the system, we will
want to be able to fine-tune things a little better.

The creator of a context will be able to specify an initial block size
and a maximum block size.  Selecting smaller values will prevent wastage
of space in contexts that aren't expected to hold very much (an example is
the relcache's per-relation contexts).

Also, it will be possible to specify a minimum context size.  If this
value is greater than zero then a block of that size will be grabbed
immediately upon context creation, and cleared but not released during
context resets.  This feature is needed for ErrorContext (see above),
but will most likely not be used for other contexts.

We expect that per-tuple contexts will be reset frequently and typically
will not allocate very much space per tuple cycle.  To make this usage
pattern cheap, the first block allocated in a context is not given
back to malloc() during reset, but just cleared.  This avoids malloc
thrashing.


Other Notes
-----------

The original version of this proposal suggested that functions returning
pass-by-reference datatypes should be required to return a value freshly
palloc'd in their caller's memory context, never a pointer to an input
value.  I've abandoned that notion since it clearly is prone to error.
In the current proposal, it is possible to discover which context a
chunk of memory is allocated in (by checking the required standard chunk
header), so nodeAgg can determine whether or not it's safe to reset
its working context; it doesn't have to rely on the transition function
to do what it's expecting.
@


1.14
log
@Revert README cleanups.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/utils/mmgr/README,v 1.12 2008/03/20 17:55:15 momjian Exp $
d8 2
a9 2
was no provision for recycling memory until end of query.  This needs to be
fixed, even more so with the advent of TOAST which will allow very large
d11 1
a11 1
the new memory management plan implemented in 7.1.
@


1.13
log
@Revert sentence removal from nickname in FAQ.
@
text
@d8 2
a9 2
was no provision for recycling memory until end of query.  This needed to be
fixed, even more so with the advent of TOAST which will allowed very large
d11 1
a11 1
the new memory management system implemented in 7.1.
@


1.12
log
@Make source code READMEs more consistent.  Add CVS tags to all README files.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/utils/mmgr/README,v 1.11 2007/05/29 04:19:35 neilc Exp $
d8 2
a9 2
was no provision for recycling memory until end of query.  This needs to be
fixed, even more so with the advent of TOAST which will allow very large
d11 1
a11 1
the new memory management plan implemented in 7.1.
@


1.11
log
@mmgr README tweak: "either" is no longer correct. The previous wording
compared PortalContext with QueryContext, but the latter no longer exists.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/utils/mmgr/README,v 1.10 2007/03/13 00:33:42 tgl Exp $
d3 2
a4 2
Notes about memory allocation redesign
--------------------------------------
d56 1
a56 1
Some notes about the palloc API versus standard C library
d76 1
a76 1
pfree/repalloc no longer depend on CurrentMemoryContext
d104 1
a104 1
Additions to the memory-context mechanism
d133 1
a133 1
Globally known contexts
d217 1
a217 1
Contexts for prepared statements and portals
d233 1
a233 1
Transient contexts during execution
d321 1
a321 1
Mechanisms to allow multiple types of contexts
d392 1
a392 1
More control over aset.c behavior
d420 1
a420 1
Other notes
@


1.10
log
@First phase of plan-invalidation project: create a plan cache management
module and teach PREPARE and protocol-level prepared statements to use it.
In service of this, rearrange utility-statement processing so that parse
analysis does not assume table schemas can't change before execution for
utility statements (necessary because we don't attempt to re-acquire locks
for utility statements when reusing a stored plan).  This requires some
refactoring of the ProcessUtility API, but it ends up cleaner anyway,
for instance we can get rid of the QueryContext global.

Still to do: fix up SPI and related code to use the plan cache; I'm tempted to
try to make SQL functions use it too.  Also, there are at least some aspects
of system state that we want to ensure remain the same during a replan as in
the original processing; search_path certainly ought to behave that way for
instance, and perhaps there are others.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/utils/mmgr/README,v 1.9 2006/09/07 22:52:01 tgl Exp $
d204 1
a204 1
PortalContext --- this is not actually a separate context either, but a
@


1.9
log
@Clean up logging for extended-query-protocol operations, as per my recent
proposal.  Parameter logging works even for binary-format parameters, and
logging overhead is avoided when disabled.

log_statement = all output for the src/test/examples/testlibpq3.c example
now looks like

LOG:  statement: execute <unnamed>: SELECT * FROM test1 WHERE t = $1
DETAIL:  parameters: $1 = 'joe''s place'
LOG:  statement: execute <unnamed>: SELECT * FROM test1 WHERE i = $1::int4
DETAIL:  parameters: $1 = '2'

and log_min_duration_statement = 0 results in

LOG:  duration: 2.431 ms  parse <unnamed>: SELECT * FROM test1 WHERE t = $1
LOG:  duration: 2.335 ms  bind <unnamed> to <unnamed>: SELECT * FROM test1 WHERE t = $1
DETAIL:  parameters: $1 = 'joe''s place'
LOG:  duration: 0.394 ms  execute <unnamed>: SELECT * FROM test1 WHERE t = $1
DETAIL:  parameters: $1 = 'joe''s place'
LOG:  duration: 1.251 ms  parse <unnamed>: SELECT * FROM test1 WHERE i = $1::int4
LOG:  duration: 0.566 ms  bind <unnamed> to <unnamed>: SELECT * FROM test1 WHERE i = $1::int4
DETAIL:  parameters: $1 = '2'
LOG:  duration: 0.173 ms  execute <unnamed>: SELECT * FROM test1 WHERE i = $1::int4
DETAIL:  parameters: $1 = '2'

(This example demonstrates the folly of ignoring parse/bind steps for duration
logging purposes, BTW.)

Along the way, create a less ad-hoc mechanism for determining which commands
are logged by log_statement = mod and log_statement = ddl.  The former coding
was actually missing quite a few things that look like ddl to me, and it
did not handle EXECUTE or extended query protocol correctly at all.

This commit does not do anything about the question of whether log_duration
should be removed or made less redundant with log_min_duration_statement.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/utils/mmgr/README,v 1.8 2004/08/04 21:34:04 tgl Exp $
a203 9
QueryContext --- this is not actually a separate context, but a global
variable pointing to the context that holds the current command's parse tree.
(In simple-Query mode this points to MessageContext; when executing a
prepared statement it will point to the prepared statement's private context.
Note that the plan tree may or may not be in this same context.)
Generally it is not appropriate for any code to use QueryContext as an
allocation target --- from the point of view of any code that would be
referencing the QueryContext variable, it's a read-only context.

d223 1
a223 3
times without further copying of these trees.  QueryContext points at this
private context while executing any portal built from the prepared
statement.
@


1.8
log
@Label CVS tip as 8.0devel instead of 7.5devel.  Adjust various comments
and documentation to reference 8.0 instead of 7.5.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql-server/src/backend/utils/mmgr/README,v 1.7 2004/07/01 00:51:29 tgl Exp $
d205 7
a211 7
variable pointing to the context that holds the current command's parse
and plan trees.  (In simple-Query mode this points to MessageContext;
when executing a prepared statement it will point at the prepared
statement's private context.)  Generally it is not appropriate for any
code to use QueryContext as an allocation target --- from the point of
view of any code that would be referencing the QueryContext variable,
it's a read-only context.
@


1.7
log
@Nested transactions.  There is still much left to do, especially on the
performance front, but with feature freeze upon us I think it's time to
drive a stake in the ground and say that this will be in 7.5.

Alvaro Herrera, with some help from Tom Lane.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql-server/src/backend/utils/mmgr/README,v 1.6 2004/06/05 19:48:09 tgl Exp $
d69 1
a69 1
error.)  (Note: this behavior is new in Postgres 7.5; earlier versions
@


1.6
log
@Tweak palloc/repalloc to allow zero bytes to be requested, as per recent
proposal.  Eliminate several dozen now-unnecessary hacks to avoid palloc(0).
(It's likely there are more that I didn't find.)
@
text
@d1 1
a1 1
$PostgreSQL: pgsql-server/src/backend/utils/mmgr/README,v 1.5 2003/11/29 19:52:04 pgsql Exp $
d93 1
a93 1
we'd have to pass an appropriate memory context to called routines in
d179 24
a202 9
TopTransactionContext --- this holds everything that lives until end of
transaction (longer than one statement within a transaction!).  An example
of what has to be here is the list of pending NOTIFY messages to be sent
at xact commit.  This context will be reset, and all its children deleted,
at conclusion of each transaction cycle.  Note: this context is NOT
cleared immediately upon error; its contents will survive until the
transaction block is exited by COMMIT/ROLLBACK.
(If we ever implement nested transactions, TopTransactionContext may need
to be split into a true "top" pointer and a "current transaction" pointer.)
@


1.5
log
@
$Header: -> $PostgreSQL Changes ...
@
text
@d1 1
a1 1
$PostgreSQL: /cvsroot/pgsql-server/src/backend/utils/mmgr/README,v 1.4 2003/04/30 19:04:12 tgl Exp $
d54 20
@


1.4
log
@Update to describe new set of globally-known contexts planned for support
of extended query features in new FE/BE protocol.  TransactionCommandContext
is gone (PortalContext replaces it for some purposes), and QueryContext
has taken on a new meaning (MessageContext plays its old role).
@
text
@d1 1
a1 1
$Header: /cvsroot/pgsql-server/src/backend/utils/mmgr/README,v 1.3 2001/02/15 21:38:26 tgl Exp $
@


1.3
log
@Update notes about memory context scheme.
@
text
@d1 1
a1 1
$Header$
d113 2
a114 2
Top-level contexts
------------------
d116 2
a117 2
There will be several top-level contexts --- these contexts have no parent
and will be referenced by global variables.  At any instant the system may
d119 12
a130 11
or indirect children of one of the top-level contexts to ensure they are
not leaked in event of an error.  I presently envision these top-level
contexts:

TopMemoryContext --- allocating here is essentially the same as "malloc",
because this context will never be reset or deleted.  This is for stuff
that should live forever, or for stuff that you know you will delete
at the appropriate time.  An example is fd.c's tables of open files,
as well as the context management nodes for memory contexts themselves.
Avoid allocating stuff here unless really necessary, and especially
avoid running with CurrentMemoryContext pointing here.
d136 3
a138 3
TopMemoryContext.  The postmaster will probably have only TopMemoryContext,
PostmasterContext, and possibly ErrorContext --- the remaining top-level
contexts will be set up in each backend during startup.)
d144 14
a157 16
(Note: CacheMemoryContext may well have child-contexts with shorter
lifespans.  For example, a child context seems like the best place to
keep the subsidiary storage associated with a relcache entry; that way
we can free rule parsetrees and so forth easily, without having to depend
on constructing a reliable version of freeObject().)

QueryContext --- this is where the storage holding a received query string
is kept, as well as storage that should live as long as the query string,
notably the parsetree constructed from it.  This context will be reset at
the top of each cycle of the outer loop of PostgresMain, thereby freeing
the old query and parsetree.  We must keep this separate from
TopTransactionContext because a query string might need to live either a
longer or shorter time than a transaction, depending on whether it
contains begin/end commands or not.  (This'll also fix the nasty bug that
"vacuum; anything else" crashes if submitted as a single query string,
because vacuum's xact commit frees the memory holding the parsetree...)
d163 44
a206 25
at conclusion of each transaction cycle.  Note: presently I envision that
this context will NOT be cleared immediately upon error; its contents
will survive anyway until the transaction block is exited by
COMMIT/ROLLBACK.  This seems appropriate since we want to move in the
direction of allowing a transaction to continue processing after an error.

TransactionCommandContext --- this is really a child of
TopTransactionContext, not a top-level context, but we'll probably store a
link to it in a global variable anyway for convenience.  All the memory
allocated during planning and execution lives here or in a child context.
This context is deleted at statement completion, whether normal completion
or error abort.

ErrorContext --- this permanent context will be switched into
for error recovery processing, and then reset on completion of recovery.
We'll arrange to have, say, 8K of memory available in it at all times.
In this way, we can ensure that some memory is available for error
recovery even if the backend has run out of memory otherwise.  This should
allow out-of-memory to be treated as a normal ERROR condition, not a FATAL
error.

If we ever implement nested transactions, there may need to be some
additional levels of transaction-local contexts between
TopTransactionContext and TransactionCommandContext, but that's beyond
the scope of this proposal.
d212 7
a218 4
The planner will probably have a transient context in which it stores
pathnodes; this will allow it to release the bulk of its temporary space
usage (which can be a lot, for large joins) at completion of planning.
The completed plan tree will be in TransactionCommandContext.
d221 7
a227 16
execution code, will normally run in a context with command lifetime.
(This will be TransactionCommandContext for normal queries, but when
executing a cursor, it will be a context associated with the cursor.)
Most of the memory allocated in these routines is intended to live until
end of query, so this is appropriate for those purposes.  We already have
a mechanism --- "tuple table slots" --- for avoiding leakage of tuples,
which is the major kind of short-lived data handled by these routines.
This still leaves a certain amount of explicit pfree'ing needed by plan
node code, but that code largely exists already and is probably not worth
trying to remove.  I looked at the possibility of running in a shorter-
lived context (such as a context that gets reset per-tuple), but this
seems fairly impractical.  The biggest problem with it is that code in
the index access routines, as well as some other complex algorithms like
tuplesort.c, assumes that palloc'd storage will live across tuples.
For example, rtree uses a palloc'd state stack to keep track of an index
scan.
d292 1
a292 1
of QueryContext and hence is certain to go away as part of normal
d386 8
a393 6
context resets.  This feature is needed for ErrorContext (see above).
It is also useful for per-tuple contexts, which will be reset frequently
and typically will not allocate very much space per tuple cycle.  We can
save a lot of unnecessary malloc traffic if these contexts hang onto one
allocation block rather than releasing and reacquiring the block on
each tuple cycle.
@


1.2
log
@Update implementation notes for new memory management logic.
@
text
@d1 3
a3 1
Notes about memory allocation redesign			14-Jul-2000
d6 1
a6 1
Up through version 7.0, Postgres has serious problems with memory leakage
d8 1
a8 1
is no provision for recycling memory until end of query.  This needs to be
d10 2
a11 1
chunks of data to be passed around in the system.  So, here is a proposal.
d200 5
a204 3
execution code, will normally run in TransactionCommandContext.  Much
of the memory allocated in these routines is intended to live until end
of query, so this is appropriate for those purposes.  We already have
d237 1
a237 1
A problem with running index accesses and sorts in TransactionCommandContext
d241 1
a241 1
so there's no problem with their result data, but there could be a problem
d272 1
a272 3
copy step.  (Actually, Unique could use the same trick with two per-tuple
contexts as described above for Agg, but there will probably be other
cases where doing an extra copy step is the right thing.)
@


1.1
log
@First phase of memory management rewrite (see backend/utils/mmgr/README
for details).  It doesn't really do that much yet, since there are no
short-term memory contexts in the executor, but the infrastructure is
in place and long-term contexts are handled reasonably.  A few long-
standing bugs have been fixed, such as 'VACUUM; anything' in a single
query string crashing.  Also, out-of-memory is now considered a
recoverable ERROR, not FATAL.
Eliminate a large amount of crufty, now-dead code in and around
memory management.
Fix problem with holding off SIGTRAP, SIGSEGV, etc in postmaster and
backend startup.
@
text
@d1 2
a2 2
Proposal for memory allocation fixes, take 2		21-Jun-2000
--------------------------------------------
d4 3
a6 3
We know that Postgres has serious problems with memory leakage during
large queries that process a lot of pass-by-reference data.  There is
no provision for recycling memory until end of query.  This needs to be
d196 47
a242 24
The executor will have contexts with lifetime similar to plan nodes
(I'm not sure at the moment whether there's need for one such context
per plan level, or whether a single context is sufficient).  These
contexts will hold plan-node-local execution state and related items.
There will also be a context on each plan level that is reset at the start
of each tuple processing cycle.  This per-tuple context will be the normal
CurrentMemoryContext during evaluation of expressions and so forth.  By
resetting it, we reclaim transient memory that was used during processing
of the prior tuple.  That should be enough to solve the problem of running
out of memory on large queries.  We must have a per-tuple context in each
plan node, and we must reset it at the start of a tuple cycle rather than
the end, so that each plan node can use results of expression evaluation
as part of the tuple it returns to its parent node.

By resetting the per-tuple context, we will be able to free memory after
each tuple is processed, rather than only after the whole plan is
processed.  This should solve our memory leakage problems pretty well;
yet we do not need to add very much new bookkeeping logic to do it.
In particular, we do *not* need to try to keep track of individual values
palloc'd during expression evaluation.

Note we assume that resetting a context is a cheap operation.  This is
true already, and we can make it even more true with a little bit of
tuning in aset.c.
a390 12

It might be that the executor per-run contexts described above should
be tied directly to executor "EState" nodes, that is, one context per
EState.  I'm not real clear on the lifespan of EStates or the situations
where we have just one or more than one, so I'm not sure.  Comments?

It would probably be possible to adapt the existing "portal" memory
management mechanism to do what we need.  I am instead proposing setting
up a totally new mechanism, because the portal code strikes me as
extremely crufty and unwieldy.  It may be that we can eventually remove
portals entirely, or perhaps reimplement them with this mechanism
underneath.
@
