head	1.54;
access;
symbols
	REL9_0_0:1.53
	REL9_1_ALPHA1:1.54
	REL9_0_RC1:1.53
	REL9_0_BETA4:1.53
	REL9_0_STABLE:1.53.0.4
	REL9_0_BETA3:1.53
	REL9_0_BETA2:1.53
	REL7_4_29:1.29
	REL8_0_25:1.30
	REL8_1_21:1.33
	REL8_2_17:1.35.2.2
	REL8_3_11:1.41.2.1
	REL8_4_4:1.49.2.2
	REL9_0_BETA1:1.53
	REL9_0_ALPHA5_BRANCH:1.53.0.2
	REL9_0_ALPHA5:1.53
	REL7_4_28:1.29
	REL8_0_24:1.30
	REL8_1_20:1.33
	REL8_2_16:1.35.2.2
	REL8_3_10:1.41.2.1
	REL8_4_3:1.49.2.2
	REL9_0_ALPHA4:1.52
	REL9_0_ALPHA4_BRANCH:1.52.0.6
	REL8_5_ALPHA3:1.52
	REL8_5_ALPHA3_BRANCH:1.52.0.4
	REL7_4_27:1.29
	REL8_0_23:1.30
	REL8_1_19:1.33
	REL8_2_15:1.35.2.2
	REL8_3_9:1.41.2.1
	REL8_4_2:1.49.2.2
	REL8_5_ALPHA2:1.52
	REL8_5_ALPHA2_BRANCH:1.52.0.2
	REL7_4_26:1.29
	REL8_0_22:1.30
	REL8_1_18:1.33
	REL8_2_14:1.35.2.2
	REL8_3_8:1.41
	REL8_4_1:1.49.2.1
	REL8_5_ALPHA1:1.50
	REL8_5_ALPHA1_BRANCH:1.50.0.2
	REL8_4_STABLE:1.49.0.2
	REL8_4_0:1.49
	REL8_4_RC2:1.49
	REL8_4_RC1:1.49
	REL8_4_BETA2:1.49
	REL8_4_BETA1:1.49
	REL7_4_25:1.29
	REL8_0_21:1.30
	REL8_1_17:1.33
	REL8_2_13:1.35.2.2
	REL8_3_7:1.41
	REL7_4_24:1.29
	REL8_0_20:1.30
	REL8_1_16:1.33
	REL8_2_12:1.35.2.2
	REL8_3_6:1.41
	REL7_4_23:1.29
	REL8_0_19:1.30
	REL8_1_15:1.33
	REL8_2_11:1.35.2.2
	REL8_3_5:1.41
	REL7_4_22:1.29
	REL8_0_18:1.30
	REL8_1_14:1.33
	REL8_2_10:1.35.2.2
	REL8_3_4:1.41
	REL7_4_21:1.29
	REL8_0_17:1.30
	REL8_1_13:1.33
	REL8_2_9:1.35.2.2
	REL8_3_3:1.41
	REL7_4_20:1.29
	REL8_0_16:1.30
	REL8_1_12:1.33
	REL8_2_8:1.35.2.2
	REL8_3_2:1.41
	REL8_2_7:1.35.2.2
	REL8_3_1:1.41
	REL8_3_STABLE:1.41.0.2
	REL8_3_0:1.41
	REL8_3_RC2:1.41
	REL7_3_21:1.25
	REL7_4_19:1.29
	REL8_0_15:1.30
	REL8_1_11:1.33
	REL8_2_6:1.35.2.2
	REL8_3_RC1:1.41
	REL8_3_BETA4:1.41
	REL8_3_BETA3:1.41
	REL8_3_BETA2:1.41
	REL8_3_BETA1:1.40
	REL7_3_20:1.25
	REL7_4_18:1.29
	REL8_0_14:1.30
	REL8_1_10:1.33
	REL8_2_5:1.35.2.1
	REL7_3_19:1.25
	REL7_4_17:1.29
	REL8_0_13:1.30
	REL8_1_9:1.33
	REL8_2_4:1.35.2.1
	REL8_0_12:1.30
	REL8_1_8:1.33
	REL8_2_3:1.35
	REL7_3_18:1.25
	REL7_4_16:1.29
	REL8_0_11:1.30
	REL8_1_7:1.33
	REL8_2_2:1.35
	REL8_0_10:1.30
	REL8_1_6:1.33
	REL8_2_1:1.35
	REL7_4_15:1.29
	REL7_3_17:1.25
	REL8_2_STABLE:1.35.0.2
	REL8_2_0:1.35
	REL8_2_RC1:1.35
	REL8_2_BETA3:1.35
	REL8_2_BETA2:1.35
	REL8_1_5:1.33
	REL8_0_9:1.30
	REL7_4_14:1.29
	REL7_3_16:1.25
	REL8_2_BETA1:1.35
	REL7_3_15:1.25
	REL7_4_13:1.29
	REL8_0_8:1.30
	REL8_1_4:1.33
	REL7_3_14:1.25
	REL7_4_12:1.29
	REL8_0_7:1.30
	REL8_1_3:1.33
	REL7_3_13:1.25
	REL7_4_11:1.29
	REL8_0_6:1.30
	REL8_1_2:1.33
	REL7_3_12:1.25
	REL7_4_10:1.29
	REL8_0_5:1.30
	REL8_1_1:1.33
	REL8_1_STABLE:1.33.0.2
	REL8_1_0:1.33
	REL8_1_0RC1:1.33
	REL8_1_0BETA4:1.33
	REL8_1_0BETA3:1.33
	REL7_3_11:1.25
	REL7_4_9:1.29
	REL8_0_4:1.30
	REL8_1_0BETA2:1.33
	REL8_1_0BETA1:1.33
	REL7_2_8:1.23
	REL7_3_10:1.25
	REL7_4_8:1.29
	REL8_0_3:1.30
	REL8_0_2:1.30
	REL7_2_7:1.23
	REL7_3_9:1.25
	REL7_4_7:1.29
	REL8_0_1:1.30
	REL8_0_STABLE:1.30.0.4
	REL8_0_0:1.30.0.2
	REL8_0_0RC5:1.30
	REL8_0_0RC4:1.30
	REL8_0_0RC3:1.30
	REL8_0_0RC2:1.30
	REL8_0_0RC1:1.30
	REL8_0_0BETA5:1.30
	REL8_0_0BETA4:1.30
	REL7_4_6:1.29
	REL7_3_8:1.25
	REL7_2_6:1.23
	REL8_0_0BETA3:1.30
	REL8_0_0BETA2:1.30
	REL7_2_5:1.23
	REL7_4_5:1.29
	REL7_3_7:1.25
	REL7_4_4:1.29
	REL8_0_0BETA1:1.30
	REL7_4_3:1.29
	REL7_4_2:1.29
	REL7_3_6:1.25
	REL7_4_1:1.29
	REL7_3_5:1.25
	REL7_4:1.29
	REL7_4_RC2:1.29
	REL7_4_STABLE:1.29.0.4
	REL7_4_RC1:1.29
	REL7_4_BETA5:1.29
	REL7_4_BETA4:1.29
	REL7_4_BETA3:1.29
	REL7_4_BETA2:1.29
	WIN32_DEV:1.29.0.2
	REL7_4_BETA1:1.29
	REL7_3_4:1.25
	REL7_3_2:1.25
	REL7_2_4:1.23
	REL7_3_STABLE:1.25.0.2
	REL7_2_3:1.23
	REL7_2_STABLE:1.23.0.2
	REL7_2:1.23
	REL7_2_RC2:1.23
	REL7_2_RC1:1.23
	REL7_2_BETA5:1.23
	REL7_2_BETA4:1.23
	REL7_2_BETA3:1.23
	REL7_2_BETA2:1.23
	REL7_2_BETA1:1.23
	REL7_1_2:1.22
	REL7_1_STABLE:1.22.0.2
	REL7_1_BETA:1.20
	REL7_1_BETA3:1.21
	REL7_1_BETA2:1.21
	REL7_1:1.22
	REL7_0_PATCHES:1.16.0.2
	REL7_0:1.16
	REL6_5_PATCHES:1.12.0.2
	REL6_5:1.12
	REL6_4:1.4.0.2
	release-6-3:1.2;
locks; strict;
comment	@# @;


1.54
date	2010.08.19.05.57.34;	author petere;	state Exp;
branches;
next	1.53;

1.53
date	2010.03.28.22.59.32;	author tgl;	state Exp;
branches;
next	1.52;

1.52
date	2009.09.29.01.20.34;	author tgl;	state Exp;
branches;
next	1.51;

1.51
date	2009.09.17.20.49.28;	author tgl;	state Exp;
branches;
next	1.50;

1.50
date	2009.07.21.02.02.44;	author tgl;	state Exp;
branches;
next	1.49;

1.49
date	2009.02.27.22.41.37;	author tgl;	state Exp;
branches
	1.49.2.1;
next	1.48;

1.48
date	2008.08.14.18.47.59;	author tgl;	state Exp;
branches;
next	1.47;

1.47
date	2008.08.02.21.31.59;	author tgl;	state Exp;
branches;
next	1.46;

1.46
date	2008.04.09.01.00.46;	author momjian;	state Exp;
branches;
next	1.45;

1.45
date	2008.04.09.00.59.24;	author momjian;	state Exp;
branches;
next	1.44;

1.44
date	2008.04.09.00.55.30;	author momjian;	state Exp;
branches;
next	1.43;

1.43
date	2008.03.21.13.23.28;	author momjian;	state Exp;
branches;
next	1.42;

1.42
date	2008.03.20.17.55.14;	author momjian;	state Exp;
branches;
next	1.41;

1.41
date	2007.10.26.18.10.50;	author tgl;	state Exp;
branches
	1.41.2.1;
next	1.40;

1.40
date	2007.09.26.18.51.50;	author tgl;	state Exp;
branches;
next	1.39;

1.39
date	2007.02.19.07.03.27;	author tgl;	state Exp;
branches;
next	1.38;

1.38
date	2007.02.16.00.14.01;	author tgl;	state Exp;
branches;
next	1.37;

1.37
date	2007.02.13.02.31.02;	author tgl;	state Exp;
branches;
next	1.36;

1.36
date	2007.01.20.20.45.38;	author tgl;	state Exp;
branches;
next	1.35;

1.35
date	2006.07.01.18.38.32;	author tgl;	state Exp;
branches
	1.35.2.1;
next	1.34;

1.34
date	2005.12.20.02.30.35;	author tgl;	state Exp;
branches;
next	1.33;

1.33
date	2005.06.09.04.18.59;	author tgl;	state Exp;
branches;
next	1.32;

1.32
date	2005.06.05.22.32.54;	author tgl;	state Exp;
branches;
next	1.31;

1.31
date	2005.04.21.19.18.12;	author tgl;	state Exp;
branches;
next	1.30;

1.30
date	2003.12.30.21.49.19;	author tgl;	state Exp;
branches;
next	1.29;

1.29
date	2003.01.20.18.54.47;	author tgl;	state Exp;
branches;
next	1.28;

1.28
date	2003.01.15.19.35.39;	author tgl;	state Exp;
branches;
next	1.27;

1.27
date	2002.11.30.05.21.02;	author tgl;	state Exp;
branches;
next	1.26;

1.26
date	2002.11.06.00.00.44;	author tgl;	state Exp;
branches;
next	1.25;

1.25
date	2002.08.25.22.39.37;	author momjian;	state Exp;
branches;
next	1.24;

1.24
date	2002.05.12.23.43.02;	author tgl;	state Exp;
branches;
next	1.23;

1.23
date	2001.10.18.16.11.41;	author tgl;	state Exp;
branches;
next	1.22;

1.22
date	2001.01.17.06.41.31;	author momjian;	state Exp;
branches;
next	1.21;

1.21
date	2000.12.14.22.30.45;	author tgl;	state Exp;
branches;
next	1.20;

1.20
date	2000.11.12.00.37.02;	author tgl;	state Exp;
branches;
next	1.19;

1.19
date	2000.09.29.18.21.30;	author tgl;	state Exp;
branches;
next	1.18;

1.18
date	2000.09.12.21.06.50;	author tgl;	state Exp;
branches;
next	1.17;

1.17
date	2000.07.24.03.10.55;	author tgl;	state Exp;
branches;
next	1.16;

1.16
date	2000.03.21.05.11.55;	author tgl;	state Exp;
branches;
next	1.15;

1.15
date	2000.02.15.20.49.31;	author tgl;	state Exp;
branches;
next	1.14;

1.14
date	2000.02.07.04.41.04;	author tgl;	state Exp;
branches;
next	1.13;

1.13
date	99.08.16.02.17.47;	author tgl;	state Exp;
branches;
next	1.12;

1.12
date	99.02.19.05.18.03;	author momjian;	state Exp;
branches;
next	1.11;

1.11
date	99.02.19.02.05.14;	author momjian;	state Exp;
branches;
next	1.10;

1.10
date	99.02.15.22.19.01;	author momjian;	state Exp;
branches;
next	1.9;

1.9
date	99.02.09.03.51.14;	author momjian;	state Exp;
branches;
next	1.8;

1.8
date	99.02.08.04.29.05;	author momjian;	state Exp;
branches;
next	1.7;

1.7
date	99.02.04.03.19.07;	author momjian;	state Exp;
branches;
next	1.6;

1.6
date	99.02.04.01.46.56;	author momjian;	state Exp;
branches;
next	1.5;

1.5
date	99.02.03.20.15.23;	author momjian;	state Exp;
branches;
next	1.4;

1.4
date	98.08.10.02.26.14;	author momjian;	state Exp;
branches;
next	1.3;

1.3
date	98.08.07.05.02.12;	author momjian;	state Exp;
branches;
next	1.2;

1.2
date	97.12.18.12.20.20;	author momjian;	state Exp;
branches;
next	1.1;

1.1
date	97.12.17.18.02.33;	author momjian;	state Exp;
branches;
next	;

1.35.2.1
date	2007.02.13.02.31.11;	author tgl;	state Exp;
branches;
next	1.35.2.2;

1.35.2.2
date	2007.10.26.18.10.58;	author tgl;	state Exp;
branches;
next	;

1.41.2.1
date	2009.09.29.01.21.02;	author tgl;	state Exp;
branches;
next	;

1.49.2.1
date	2009.07.21.02.02.51;	author tgl;	state Exp;
branches;
next	1.49.2.2;

1.49.2.2
date	2009.09.29.01.20.53;	author tgl;	state Exp;
branches;
next	;


desc
@@


1.54
log
@Remove extra newlines at end and beginning of files, add missing newlines
at end of files.
@
text
@$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.53 2010/03/28 22:59:32 tgl Exp $

Optimizer
=========

These directories take the Query structure returned by the parser, and
generate a plan used by the executor.  The /plan directory generates the
actual output plan, the /path code generates all possible ways to join the
tables, and /prep handles various preprocessing steps for special cases.
/util is utility stuff.  /geqo is the separate "genetic optimization" planner
--- it does a semi-random search through the join tree space, rather than
exhaustively considering all possible join trees.  (But each join considered
by /geqo is given to /path to create paths for, so we consider all possible
implementation paths for each specific join pair even in GEQO mode.)


Paths and Join Pairs
--------------------

During the planning/optimizing process, we build "Path" trees representing
the different ways of doing a query.  We select the cheapest Path that
generates the desired relation and turn it into a Plan to pass to the
executor.  (There is pretty much a one-to-one correspondence between the
Path and Plan trees, but Path nodes omit info that won't be needed during
planning, and include info needed for planning that won't be needed by the
executor.)

The optimizer builds a RelOptInfo structure for each base relation used in
the query.  Base rels are either primitive tables, or subquery subselects
that are planned via a separate recursive invocation of the planner.  A
RelOptInfo is also built for each join relation that is considered during
planning.  A join rel is simply a combination of base rels.  There is only
one join RelOptInfo for any given set of baserels --- for example, the join
{A B C} is represented by the same RelOptInfo no matter whether we build it
by joining A and B first and then adding C, or joining B and C first and
then adding A, etc.  These different means of building the joinrel are
represented as Paths.  For each RelOptInfo we build a list of Paths that
represent plausible ways to implement the scan or join of that relation.
Once we've considered all the plausible Paths for a rel, we select the one
that is cheapest according to the planner's cost estimates.  The final plan
is derived from the cheapest Path for the RelOptInfo that includes all the
base rels of the query.

Possible Paths for a primitive table relation include plain old sequential
scan, plus index scans for any indexes that exist on the table, plus bitmap
index scans using one or more indexes.  A subquery base relation just has
one Path, a "SubqueryScan" path (which links to the subplan that was built
by a recursive invocation of the planner).  Likewise a function-RTE base
relation has only one possible Path.

Joins always occur using two RelOptInfos.  One is outer, the other inner.
Outers drive lookups of values in the inner.  In a nested loop, lookups of
values in the inner occur by scanning the inner path once per outer tuple
to find each matching inner row.  In a mergejoin, inner and outer rows are
ordered, and are accessed in order, so only one scan is required to perform
the entire join: both inner and outer paths are scanned in-sync.  (There's
not a lot of difference between inner and outer in a mergejoin...)  In a
hashjoin, the inner is scanned first and all its rows are entered in a
hashtable, then the outer is scanned and for each row we look up the join
key in the hashtable.

A Path for a join relation is actually a tree structure, with the top
Path node representing the join method.  It has left and right subpaths
that represent the scan or join methods used for the two input relations.


Join Tree Construction
----------------------

The optimizer generates optimal query plans by doing a more-or-less
exhaustive search through the ways of executing the query.  The best Path
tree is found by a recursive process:

1) Take each base relation in the query, and make a RelOptInfo structure
for it.  Find each potentially useful way of accessing the relation,
including sequential and index scans, and make Paths representing those
ways.  All the Paths made for a given relation are placed in its
RelOptInfo.pathlist.  (Actually, we discard Paths that are obviously
inferior alternatives before they ever get into the pathlist --- what
ends up in the pathlist is the cheapest way of generating each potentially
useful sort ordering of the relation.)  Also create a RelOptInfo.joininfo
list including all the join clauses that involve this relation.  For
example, the WHERE clause "tab1.col1 = tab2.col1" generates entries in
both tab1 and tab2's joininfo lists.

If we have only a single base relation in the query, we are done.
Otherwise we have to figure out how to join the base relations into a
single join relation.

2) Normally, any explicit JOIN clauses are "flattened" so that we just
have a list of relations to join.  However, FULL OUTER JOIN clauses are
never flattened, and other kinds of JOIN might not be either, if the
flattening process is stopped by join_collapse_limit or from_collapse_limit
restrictions.  Therefore, we end up with a planning problem that contains
lists of relations to be joined in any order, where any individual item
might be a sub-list that has to be joined together before we can consider
joining it to its siblings.  We process these sub-problems recursively,
bottom up.  Note that the join list structure constrains the possible join
orders, but it doesn't constrain the join implementation method at each
join (nestloop, merge, hash), nor does it say which rel is considered outer
or inner at each join.  We consider all these possibilities in building
Paths. We generate a Path for each feasible join method, and select the
cheapest Path.

For each planning problem, therefore, we will have a list of relations
that are either base rels or joinrels constructed per sub-join-lists.
We can join these rels together in any order the planner sees fit.
The standard (non-GEQO) planner does this as follows:

Consider joining each RelOptInfo to each other RelOptInfo for which there
is a usable joinclause, and generate a Path for each possible join method
for each such pair.  (If we have a RelOptInfo with no join clauses, we have
no choice but to generate a clauseless Cartesian-product join; so we
consider joining that rel to each other available rel.  But in the presence
of join clauses we will only consider joins that use available join
clauses.  Note that join-order restrictions induced by outer joins and
IN/EXISTS clauses are also checked, to ensure that we find a workable join
order in cases where those restrictions force a clauseless join to be done.)

If we only had two relations in the list, we are done: we just pick
the cheapest path for the join RelOptInfo.  If we had more than two, we now
need to consider ways of joining join RelOptInfos to each other to make
join RelOptInfos that represent more than two list items.

The join tree is constructed using a "dynamic programming" algorithm:
in the first pass (already described) we consider ways to create join rels
representing exactly two list items.  The second pass considers ways
to make join rels that represent exactly three list items; the next pass,
four items, etc.  The last pass considers how to make the final join
relation that includes all list items --- obviously there can be only one
join rel at this top level, whereas there can be more than one join rel
at lower levels.  At each level we use joins that follow available join
clauses, if possible, just as described for the first level.

For example:

    SELECT  *
    FROM    tab1, tab2, tab3, tab4
    WHERE   tab1.col = tab2.col AND
        tab2.col = tab3.col AND
        tab3.col = tab4.col

    Tables 1, 2, 3, and 4 are joined as:
    {1 2},{2 3},{3 4}
    {1 2 3},{2 3 4}
    {1 2 3 4}
    (other possibilities will be excluded for lack of join clauses)

    SELECT  *
    FROM    tab1, tab2, tab3, tab4
    WHERE   tab1.col = tab2.col AND
        tab1.col = tab3.col AND
        tab1.col = tab4.col

    Tables 1, 2, 3, and 4 are joined as:
    {1 2},{1 3},{1 4}
    {1 2 3},{1 3 4},{1 2 4}
    {1 2 3 4}

We consider left-handed plans (the outer rel of an upper join is a joinrel,
but the inner is always a single list item); right-handed plans (outer rel
is always a single item); and bushy plans (both inner and outer can be
joins themselves).  For example, when building {1 2 3 4} we consider
joining {1 2 3} to {4} (left-handed), {4} to {1 2 3} (right-handed), and
{1 2} to {3 4} (bushy), among other choices.  Although the jointree
scanning code produces these potential join combinations one at a time,
all the ways to produce the same set of joined base rels will share the
same RelOptInfo, so the paths produced from different join combinations
that produce equivalent joinrels will compete in add_path().

Once we have built the final join rel, we use either the cheapest path
for it or the cheapest path with the desired ordering (if that's cheaper
than applying a sort to the cheapest other path).

If the query contains one-sided outer joins (LEFT or RIGHT joins), or
IN or EXISTS WHERE clauses that were converted to joins, then some of
the possible join orders may be illegal.  These are excluded by having
join_is_legal consult a side list of such "special" joins to see
whether a proposed join is illegal.  (The same consultation allows it
to see which join style should be applied for a valid join, i.e.,
JOIN_INNER, JOIN_LEFT, etc.)


Valid OUTER JOIN Optimizations
------------------------------

The planner's treatment of outer join reordering is based on the following
identities:

1.	(A leftjoin B on (Pab)) innerjoin C on (Pac)
	= (A innerjoin C on (Pac)) leftjoin B on (Pab)

where Pac is a predicate referencing A and C, etc. (in this case, clearly
Pac cannot reference B, or the transformation is nonsensical).

2.	(A leftjoin B on (Pab)) leftjoin C on (Pac)
	= (A leftjoin C on (Pac)) leftjoin B on (Pab)

3.	(A leftjoin B on (Pab)) leftjoin C on (Pbc)
	= A leftjoin (B leftjoin C on (Pbc)) on (Pab)

Identity 3 only holds if predicate Pbc must fail for all-null B rows
(that is, Pbc is strict for at least one column of B).  If Pbc is not
strict, the first form might produce some rows with nonnull C columns
where the second form would make those entries null.

RIGHT JOIN is equivalent to LEFT JOIN after switching the two input
tables, so the same identities work for right joins.

An example of a case that does *not* work is moving an innerjoin into or
out of the nullable side of an outer join:

	A leftjoin (B join C on (Pbc)) on (Pab)
	!= (A leftjoin B on (Pab)) join C on (Pbc)

SEMI joins work a little bit differently.  A semijoin can be reassociated
into or out of the lefthand side of another semijoin, left join, or
antijoin, but not into or out of the righthand side.  Likewise, an inner
join, left join, or antijoin can be reassociated into or out of the
lefthand side of a semijoin, but not into or out of the righthand side.

ANTI joins work approximately like LEFT joins, except that identity 3
fails if the join to C is an antijoin (even if Pbc is strict, and in
both the cases where the other join is a leftjoin and where it is an
antijoin).  So we can't reorder antijoins into or out of the RHS of a
leftjoin or antijoin, even if the relevant clause is strict.

The current code does not attempt to re-order FULL JOINs at all.
FULL JOIN ordering is enforced by not collapsing FULL JOIN nodes when
translating the jointree to "joinlist" representation.  Other types of
JOIN nodes are normally collapsed so that they participate fully in the
join order search.  To avoid generating illegal join orders, the planner
creates a SpecialJoinInfo node for each non-inner join, and join_is_legal
checks this list to decide if a proposed join is legal.

What we store in SpecialJoinInfo nodes are the minimum sets of Relids
required on each side of the join to form the outer join.  Note that
these are minimums; there's no explicit maximum, since joining other
rels to the OJ's syntactic rels may be legal.  Per identities 1 and 2,
non-FULL joins can be freely associated into the lefthand side of an
OJ, but in some cases they can't be associated into the righthand side.
So the restriction enforced by join_is_legal is that a proposed join
can't join a rel within or partly within an RHS boundary to one outside
the boundary, unless the join validly implements some outer join.
(To support use of identity 3, we have to allow cases where an apparent
violation of a lower OJ's RHS is committed while forming an upper OJ.
If this wouldn't in fact be legal, the upper OJ's minimum LHS or RHS
set must be expanded to include the whole of the lower OJ, thereby
preventing it from being formed before the lower OJ is.)


Pulling Up Subqueries
---------------------

As we described above, a subquery appearing in the range table is planned
independently and treated as a "black box" during planning of the outer
query.  This is necessary when the subquery uses features such as
aggregates, GROUP BY, or DISTINCT.  But if the subquery is just a simple
scan or join, treating the subquery as a black box may produce a poor plan
compared to considering it as part of the entire plan search space.
Therefore, at the start of the planning process the planner looks for
simple subqueries and pulls them up into the main query's jointree.

Pulling up a subquery may result in FROM-list joins appearing below the top
of the join tree.  Each FROM-list is planned using the dynamic-programming
search method described above.

If pulling up a subquery produces a FROM-list as a direct child of another
FROM-list, then we can merge the two FROM-lists together.  Once that's
done, the subquery is an absolutely integral part of the outer query and
will not constrain the join tree search space at all.  However, that could
result in unpleasant growth of planning time, since the dynamic-programming
search has runtime exponential in the number of FROM-items considered.
Therefore, we don't merge FROM-lists if the result would have too many
FROM-items in one list.


Optimizer Functions
-------------------

The primary entry point is planner().

planner()
 set up for recursive handling of subqueries
 do final cleanup after planning
-subquery_planner()
 pull up sublinks and subqueries from rangetable, if possible
 canonicalize qual
     Attempt to simplify WHERE clause to the most useful form; this includes
     flattening nested AND/ORs and detecting clauses that are duplicated in
     different branches of an OR.
 simplify constant expressions
 process sublinks
 convert Vars of outer query levels into Params
--grouping_planner()
  preprocess target list for non-SELECT queries
  handle UNION/INTERSECT/EXCEPT, GROUP BY, HAVING, aggregates,
	ORDER BY, DISTINCT, LIMIT
--query_planner()
   pull out constant quals, which can be used to gate execution of the
	whole plan (if any are found, we make a top-level Result node
	to do the gating)
   make list of base relations used in query
   split up the qual into restrictions (a=1) and joins (b=c)
   find qual clauses that enable merge and hash joins
----make_one_rel()
     set_base_rel_pathlist()
      find seqscan and all index paths for each base relation
      find selectivity of columns used in joins
     make_rel_from_joinlist()
      hand off join subproblems to a plugin, GEQO, or standard_join_search()
-----standard_join_search()
      call join_search_one_level() for each level of join tree needed
      join_search_one_level():
        For each joinrel of the prior level, do make_rels_by_clause_joins()
        if it has join clauses, or make_rels_by_clauseless_joins() if not.
        Also generate "bushy plan" joins between joinrels of lower levels.
      Back at standard_join_search(), apply set_cheapest() to extract the
      cheapest path for each newly constructed joinrel.
      Loop back if this wasn't the top join level.
   Back at query_planner:
    put back any constant quals by adding a Result node
 Back at grouping_planner:
 do grouping(GROUP)
 do aggregates
 make unique(DISTINCT)
 make sort(ORDER BY)
 make limit(LIMIT/OFFSET)


Optimizer Data Structures
-------------------------

PlannerGlobal   - global information for a single planner invocation

PlannerInfo     - information for planning a particular Query (we make
                  a separate PlannerInfo node for each sub-Query)

RelOptInfo      - a relation or joined relations

 RestrictInfo   - WHERE clauses, like "x = 3" or "y = z"
                  (note the same structure is used for restriction and
                   join clauses)

 Path           - every way to generate a RelOptInfo (sequential, index, joins)
  SeqScan       - a plain Path node with pathtype = T_SeqScan
  IndexPath     - index scans
  BitmapHeapPath - top of a bitmapped index scan
  TidPath       - scan by CTID
  AppendPath    - append multiple subpaths together
  ResultPath    - a Result plan node (used for FROM-less SELECT)
  MaterialPath  - a Material plan node
  UniquePath    - remove duplicate rows
  NestPath      - nested-loop joins
  MergePath     - merge joins
  HashPath      - hash joins

 EquivalenceClass - a data structure representing a set of values known equal

 PathKey        - a data structure representing the sort ordering of a path

The optimizer spends a good deal of its time worrying about the ordering
of the tuples returned by a path.  The reason this is useful is that by
knowing the sort ordering of a path, we may be able to use that path as
the left or right input of a mergejoin and avoid an explicit sort step.
Nestloops and hash joins don't really care what the order of their inputs
is, but mergejoin needs suitably ordered inputs.  Therefore, all paths
generated during the optimization process are marked with their sort order
(to the extent that it is known) for possible use by a higher-level merge.

It is also possible to avoid an explicit sort step to implement a user's
ORDER BY clause if the final path has the right ordering already, so the
sort ordering is of interest even at the top level.  query_planner() will
look for the cheapest path with a sort order matching the desired order,
and grouping_planner() will compare its cost to the cost of using the
cheapest-overall path and doing an explicit sort.

When we are generating paths for a particular RelOptInfo, we discard a path
if it is more expensive than another known path that has the same or better
sort order.  We will never discard a path that is the only known way to
achieve a given sort order (without an explicit sort, that is).  In this
way, the next level up will have the maximum freedom to build mergejoins
without sorting, since it can pick from any of the paths retained for its
inputs.


EquivalenceClasses
------------------

During the deconstruct_jointree() scan of the query's qual clauses, we look
for mergejoinable equality clauses A = B whose applicability is not delayed
by an outer join; these are called "equivalence clauses".  When we find
one, we create an EquivalenceClass containing the expressions A and B to
record this knowledge.  If we later find another equivalence clause B = C,
we add C to the existing EquivalenceClass for {A B}; this may require
merging two existing EquivalenceClasses.  At the end of the scan, we have
sets of values that are all known to be transitively equal to each other.
We can therefore use a comparison of any pair of the values as a restriction or
join clause (when these values are available at the scan or join, of
course); furthermore, we need test only one such comparison, not all of
them.  Therefore, equivalence clauses are removed from the standard qual
distribution process.  Instead, when preparing a restriction or join clause
list, we examine each EquivalenceClass to see if it can contribute a
clause, and if so we select an appropriate pair of values to compare.  For
example, if we are trying to join A's relation to C's, we can generate the
clause A = C, even though this appeared nowhere explicitly in the original
query.  This may allow us to explore join paths that otherwise would have
been rejected as requiring Cartesian-product joins.

Sometimes an EquivalenceClass may contain a pseudo-constant expression
(i.e., one not containing Vars or Aggs of the current query level, nor
volatile functions).  In this case we do not follow the policy of
dynamically generating join clauses: instead, we dynamically generate
restriction clauses "var = const" wherever one of the variable members of
the class can first be computed.  For example, if we have A = B and B = 42,
we effectively generate the restriction clauses A = 42 and B = 42, and then
we need not bother with explicitly testing the join clause A = B when the
relations are joined.  In effect, all the class members can be tested at
relation-scan level and there's never a need for join tests.

The precise technical interpretation of an EquivalenceClass is that it
asserts that at any plan node where more than one of its member values
can be computed, output rows in which the values are not all equal may
be discarded without affecting the query result.  (We require all levels
of the plan to enforce EquivalenceClasses, hence a join need not recheck
equality of values that were computable by one of its children.)  For an
ordinary EquivalenceClass that is "valid everywhere", we can further infer
that the values are all non-null, because all mergejoinable operators are
strict.  However, we also allow equivalence clauses that appear below the
nullable side of an outer join to form EquivalenceClasses; for these
classes, the interpretation is that either all the values are equal, or
all (except pseudo-constants) have gone to null.  (This requires a
limitation that non-constant members be strict, else they might not go
to null when the other members do.)  Consider for example

	SELECT *
	  FROM a LEFT JOIN
	       (SELECT * FROM b JOIN c ON b.y = c.z WHERE b.y = 10) ss
	       ON a.x = ss.y
	  WHERE a.x = 42;

We can form the below-outer-join EquivalenceClass {b.y c.z 10} and thereby
apply c.z = 10 while scanning c.  (The reason we disallow outerjoin-delayed
clauses from forming EquivalenceClasses is exactly that we want to be able
to push any derived clauses as far down as possible.)  But once above the
outer join it's no longer necessarily the case that b.y = 10, and thus we
cannot use such EquivalenceClasses to conclude that sorting is unnecessary
(see discussion of PathKeys below).

In this example, notice also that a.x = ss.y (really a.x = b.y) is not an
equivalence clause because its applicability to b is delayed by the outer
join; thus we do not try to insert b.y into the equivalence class {a.x 42}.
But since we see that a.x has been equated to 42 above the outer join, we
are able to form a below-outer-join class {b.y 42}; this restriction can be
added because no b/c row not having b.y = 42 can contribute to the result
of the outer join, and so we need not compute such rows.  Now this class
will get merged with {b.y c.z 10}, leading to the contradiction 10 = 42,
which lets the planner deduce that the b/c join need not be computed at all
because none of its rows can contribute to the outer join.  (This gets
implemented as a gating Result filter, since more usually the potential
contradiction involves Param values rather than just Consts, and thus has
to be checked at runtime.)

To aid in determining the sort ordering(s) that can work with a mergejoin,
we mark each mergejoinable clause with the EquivalenceClasses of its left
and right inputs.  For an equivalence clause, these are of course the same
EquivalenceClass.  For a non-equivalence mergejoinable clause (such as an
outer-join qualification), we generate two separate EquivalenceClasses for
the left and right inputs.  This may result in creating single-item
equivalence "classes", though of course these are still subject to merging
if other equivalence clauses are later found to bear on the same
expressions.

Another way that we may form a single-item EquivalenceClass is in creation
of a PathKey to represent a desired sort order (see below).  This is a bit
different from the above cases because such an EquivalenceClass might
contain an aggregate function or volatile expression.  (A clause containing
a volatile function will never be considered mergejoinable, even if its top
operator is mergejoinable, so there is no way for a volatile expression to
get into EquivalenceClasses otherwise.  Aggregates are disallowed in WHERE
altogether, so will never be found in a mergejoinable clause.)  This is just
a convenience to maintain a uniform PathKey representation: such an
EquivalenceClass will never be merged with any other.  Note in particular
that a single-item EquivalenceClass {a.x} is *not* meant to imply an
assertion that a.x = a.x; the practical effect of this is that a.x could
be NULL.

An EquivalenceClass also contains a list of btree opfamily OIDs, which
determines what the equalities it represents actually "mean".  All the
equivalence clauses that contribute to an EquivalenceClass must have
equality operators that belong to the same set of opfamilies.  (Note: most
of the time, a particular equality operator belongs to only one family, but
it's possible that it belongs to more than one.  We keep track of all the
families to ensure that we can make use of an index belonging to any one of
the families for mergejoin purposes.)


PathKeys
--------

The PathKeys data structure represents what is known about the sort order
of the tuples generated by a particular Path.  A path's pathkeys field is a
list of PathKey nodes, where the n'th item represents the n'th sort key of
the result.  Each PathKey contains these fields:

	* a reference to an EquivalenceClass
	* a btree opfamily OID (must match one of those in the EC)
	* a sort direction (ascending or descending)
	* a nulls-first-or-last flag

The EquivalenceClass represents the value being sorted on.  Since the
various members of an EquivalenceClass are known equal according to the
opfamily, we can consider a path sorted by any one of them to be sorted by
any other too; this is what justifies referencing the whole
EquivalenceClass rather than just one member of it.

In single/base relation RelOptInfos, the Paths represent various ways
of scanning the relation and the resulting ordering of the tuples.
Sequential scan Paths have NIL pathkeys, indicating no known ordering.
Index scans have Path.pathkeys that represent the chosen index's ordering,
if any.  A single-key index would create a single-PathKey list, while a
multi-column index generates a list with one element per index column.
(Actually, since an index can be scanned either forward or backward, there
are two possible sort orders and two possible PathKey lists it can
generate.)

Note that a bitmap scan or multi-pass indexscan (OR clause scan) has NIL
pathkeys since we can say nothing about the overall order of its result.
Also, an indexscan on an unordered type of index generates NIL pathkeys.
However, we can always create a pathkey by doing an explicit sort.  The
pathkeys for a Sort plan's output just represent the sort key fields and
the ordering operators used.

Things get more interesting when we consider joins.  Suppose we do a
mergejoin between A and B using the mergeclause A.X = B.Y.  The output
of the mergejoin is sorted by X --- but it is also sorted by Y.  Again,
this can be represented by a PathKey referencing an EquivalenceClass
containing both X and Y.

With a little further thought, it becomes apparent that nestloop joins
can also produce sorted output.  For example, if we do a nestloop join
between outer relation A and inner relation B, then any pathkeys relevant
to A are still valid for the join result: we have not altered the order of
the tuples from A.  Even more interesting, if there was an equivalence clause
A.X=B.Y, and A.X was a pathkey for the outer relation A, then we can assert
that B.Y is a pathkey for the join result; X was ordered before and still
is, and the joined values of Y are equal to the joined values of X, so Y
must now be ordered too.  This is true even though we used neither an
explicit sort nor a mergejoin on Y.  (Note: hash joins cannot be counted
on to preserve the order of their outer relation, because the executor
might decide to "batch" the join, so we always set pathkeys to NIL for
a hashjoin path.)  Exception: a RIGHT or FULL join doesn't preserve the
ordering of its outer relation, because it might insert nulls at random
points in the ordering.

In general, we can justify using EquivalenceClasses as the basis for
pathkeys because, whenever we scan a relation containing multiple
EquivalenceClass members or join two relations each containing
EquivalenceClass members, we apply restriction or join clauses derived from
the EquivalenceClass.  This guarantees that any two values listed in the
EquivalenceClass are in fact equal in all tuples emitted by the scan or
join, and therefore that if the tuples are sorted by one of the values,
they can be considered sorted by any other as well.  It does not matter
whether the test clause is used as a mergeclause, or merely enforced
after-the-fact as a qpqual filter.

Note that there is no particular difficulty in labeling a path's sort
order with a PathKey referencing an EquivalenceClass that contains
variables not yet joined into the path's output.  We can simply ignore
such entries as not being relevant (yet).  This makes it possible to
use the same EquivalenceClasses throughout the join planning process.
In fact, by being careful not to generate multiple identical PathKey
objects, we can reduce comparison of EquivalenceClasses and PathKeys
to simple pointer comparison, which is a huge savings because add_path
has to make a large number of PathKey comparisons in deciding whether
competing Paths are equivalently sorted.

Pathkeys are also useful to represent an ordering that we wish to achieve,
since they are easily compared to the pathkeys of a potential candidate
path.  So, SortGroupClause lists are turned into pathkeys lists for use
inside the optimizer.

Because we have to generate pathkeys lists from the sort clauses before
we've finished EquivalenceClass merging, we cannot use the pointer-equality
method of comparing PathKeys in the earliest stages of the planning
process.  Instead, we generate "non-canonical" PathKeys that reference
single-element EquivalenceClasses that might get merged later.  After we
complete EquivalenceClass merging, we replace these with "canonical"
PathKeys that reference only fully-merged classes, and after that we make
sure we don't generate more than one copy of each "canonical" PathKey.
Then it is safe to use pointer comparison on canonical PathKeys.

An additional refinement we can make is to insist that canonical pathkey
lists (sort orderings) do not mention the same EquivalenceClass more than
once.  For example, in all these cases the second sort column is redundant,
because it cannot distinguish values that are the same according to the
first sort column:
	SELECT ... ORDER BY x, x
	SELECT ... ORDER BY x, x DESC
	SELECT ... WHERE x = y ORDER BY x, y
Although a user probably wouldn't write "ORDER BY x,x" directly, such
redundancies are more probable once equivalence classes have been
considered.  Also, the system may generate redundant pathkey lists when
computing the sort ordering needed for a mergejoin.  By eliminating the
redundancy, we save time and improve planning, since the planner will more
easily recognize equivalent orderings as being equivalent.

Another interesting property is that if the underlying EquivalenceClass
contains a constant and is not below an outer join, then the pathkey is
completely redundant and need not be sorted by at all!  Every row must
contain the same constant value, so there's no need to sort.  (If the EC is
below an outer join, we still have to sort, since some of the rows might
have gone to null and others not.  In this case we must be careful to pick
a non-const member to sort by.  The assumption that all the non-const
members go to null at the same plan level is critical here, else they might
not produce the same sort order.)  This might seem pointless because users
are unlikely to write "... WHERE x = 42 ORDER BY x", but it allows us to
recognize when particular index columns are irrelevant to the sort order:
if we have "... WHERE x = 42 ORDER BY y", scanning an index on (x,y)
produces correctly ordered data without a sort step.  We used to have very
ugly ad-hoc code to recognize that in limited contexts, but discarding
constant ECs from pathkeys makes it happen cleanly and automatically.

You might object that a below-outer-join EquivalenceClass doesn't always
represent the same values at every level of the join tree, and so using
it to uniquely identify a sort order is dubious.  This is true, but we
can avoid dealing with the fact explicitly because we always consider that
an outer join destroys any ordering of its nullable inputs.  Thus, even
if a path was sorted by {a.x} below an outer join, we'll re-sort if that
sort ordering was important; and so using the same PathKey for both sort
orderings doesn't create any real problem.



Though Bob Devine <bob.devine@@worldnet.att.net> was not involved in the 
coding of our optimizer, he is available to field questions about
optimizer topics.

-- bjm & tgl
@


1.53
log
@Rework join-removal logic as per recent discussion.  In particular this
fixes things so that it works for cases where nested removals are possible.
The overhead of the optimization should be significantly less, as well.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.52 2009/09/29 01:20:34 tgl Exp $
a639 1

@


1.52
log
@Fix equivclass.c's not-quite-right strategy for handling X=X clauses.

The original coding correctly noted that these aren't just redundancies
(they're effectively X IS NOT NULL, assuming = is strict).  However, they
got treated that way if X happened to be in a single-member EquivalenceClass
already, which could happen if there was an ORDER BY X clause, for instance.
The simplest and most reliable solution seems to be to not try to process
such clauses through the EquivalenceClass machinery; just throw them back
for traditional processing.  The amount of work that'd be needed to be
smarter than that seems out of proportion to the benefit.

Per bug #5084 from Bernt Marius Johnsen, and analysis by Andrew Gierth.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.51 2009/09/17 20:49:28 tgl Exp $
a356 1
  NoOpPath      - same as its input path (used when a join is removed)
@


1.51
log
@Implement "join removal" for cases where the inner side of a left join
is unique and is not referenced above the join.  In this case the inner
side doesn't affect the query result and can be thrown away entirely.
Although perhaps nobody would ever write such a thing by hand, it's
a reasonably common case in machine-generated SQL.

The current implementation only recognizes the case where the inner side
is a simple relation with a unique index matching the query conditions.
This is enough for the use-cases that have been shown so far, but we
might want to try to handle other cases later.

Robert Haas, somewhat rewritten by Tom
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.50 2009/07/21 02:02:44 tgl Exp $
d484 4
a487 1
EquivalenceClass will never be merged with any other.
@


1.50
log
@Fix another semijoin-ordering bug.  We already knew that we couldn't
reorder a semijoin into or out of the righthand side of another semijoin,
but actually it doesn't work to reorder it into or out of the righthand
side of a left or antijoin, either.  Per bug #4906 from Mathieu Fenniak.

This was sloppy thinking on my part.  This identity does work:

	( A left join B on (Pab) ) semijoin C on (Pac)
==
	( A semijoin C on (Pac) ) left join B on (Pab)

but I failed to see that that doesn't mean this does:

	( A left join B on (Pab) ) semijoin C on (Pbc)
!=
	A left join ( B semijoin C on (Pbc) ) on (Pab)
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.49 2009/02/27 22:41:37 tgl Exp $
d357 1
@


1.49
log
@Tighten up join ordering rules to account for recent more-careful analysis
of the associativity of antijoins.  Also improve optimizer/README discussion
of outer join ordering rules.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.48 2008/08/14 18:47:59 tgl Exp $
d217 4
a220 4
into or out of the lefthand side of another semijoin, but not into or out
of the righthand side.  Likewise, an inner join, left join, or antijoin
can be reassociated into or out of the lefthand side of a semijoin, but
not into or out of the righthand side.
@


1.49.2.1
log
@Fix another semijoin-ordering bug.  We already knew that we couldn't
reorder a semijoin into or out of the righthand side of another semijoin,
but actually it doesn't work to reorder it into or out of the righthand
side of a left or antijoin, either.  Per bug #4906 from Mathieu Fenniak.

This was sloppy thinking on my part.  This identity does work:

	( A left join B on (Pab) ) semijoin C on (Pac)
==
	( A semijoin C on (Pac) ) left join B on (Pab)

but I failed to see that that doesn't mean this does:

	( A left join B on (Pab) ) semijoin C on (Pbc)
!=
	A left join ( B semijoin C on (Pbc) ) on (Pab)
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.49 2009/02/27 22:41:37 tgl Exp $
d217 4
a220 4
into or out of the lefthand side of another semijoin, left join, or
antijoin, but not into or out of the righthand side.  Likewise, an inner
join, left join, or antijoin can be reassociated into or out of the
lefthand side of a semijoin, but not into or out of the righthand side.
@


1.49.2.2
log
@Fix equivclass.c's not-quite-right strategy for handling X=X clauses.

The original coding correctly noted that these aren't just redundancies
(they're effectively X IS NOT NULL, assuming = is strict).  However, they
got treated that way if X happened to be in a single-member EquivalenceClass
already, which could happen if there was an ORDER BY X clause, for instance.
The simplest and most reliable solution seems to be to not try to process
such clauses through the EquivalenceClass machinery; just throw them back
for traditional processing.  The amount of work that'd be needed to be
smarter than that seems out of proportion to the benefit.

Per bug #5084 from Bernt Marius Johnsen, and analysis by Andrew Gierth.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.49.2.1 2009/07/21 02:02:51 tgl Exp $
d483 1
a483 4
EquivalenceClass will never be merged with any other.  Note in particular
that a single-item EquivalenceClass {a.x} is *not* meant to imply an
assertion that a.x = a.x; the practical effect of this is that a.x could
be NULL.
@


1.48
log
@Implement SEMI and ANTI joins in the planner and executor.  (Semijoins replace
the old JOIN_IN code, but antijoins are new functionality.)  Teach the planner
to convert appropriate EXISTS and NOT EXISTS subqueries into semi and anti
joins respectively.  Also, LEFT JOINs with suitable upper-level IS NULL
filters are recognized as being anti joins.  Unify the InClauseInfo and
OuterJoinInfo infrastructure into "SpecialJoinInfo".  With that change,
it becomes possible to associate a SpecialJoinInfo with every join attempt,
which permits some cleanup of join selectivity estimation.  That needs to be
taken much further than this patch does, but the next step is to change the
API for oprjoin selectivity functions, which seems like material for a
separate patch.  So for the moment the output size estimates for semi and
especially anti joins are quite bogus.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.47 2008/08/02 21:31:59 tgl Exp $
d208 1
a208 2
tables, so the same identities work for right joins.  Only FULL JOIN
cannot be re-ordered at all.
d216 13
d230 1
a230 1
translating the jointree to "joinlist" representation.  LEFT and RIGHT
d233 1
a233 1
creates a SpecialJoinInfo node for each outer join, and join_is_legal
d241 1
a241 1
OJ, but in general they can't be associated into the righthand side.
@


1.47
log
@Rearrange the querytree representation of ORDER BY/GROUP BY/DISTINCT items
as per my recent proposal:

1. Fold SortClause and GroupClause into a single node type SortGroupClause.
We were already relying on them to be struct-equivalent, so using two node
tags wasn't accomplishing much except to get in the way of comparing items
with equal().

2. Add an "eqop" field to SortGroupClause to carry the associated equality
operator.  This is cheap for the parser to get at the same time it's looking
up the sort operator, and storing it eliminates the need for repeated
not-so-cheap lookups during planning.  In future this will also let us
represent GROUP/DISTINCT operations on datatypes that have hash opclasses
but no btree opclasses (ie, they have equality but no natural sort order).
The previous representation simply didn't work for that, since its only
indicator of comparison semantics was a sort operator.

3. Add a hasDistinctOn boolean to struct Query to explicitly record whether
the distinctClause came from DISTINCT or DISTINCT ON.  This allows removing
some complicated and not 100% bulletproof code that attempted to figure
that out from the distinctClause alone.

This patch doesn't in itself create any new capability, but it's necessary
infrastructure for future attempts to use hash-based grouping for DISTINCT
and UNION/INTERSECT/EXCEPT.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.46 2008/04/09 01:00:46 momjian Exp $
d117 2
a118 3
IN clauses are treated as if they were real join clauses, to ensure that
we find a workable join order in cases where those restrictions force a
clauseless join to be done.)
d176 1
a176 1
"IN (sub-select)" WHERE clauses that were converted to joins, then some of
d178 1
a178 1
join_is_legal consult side lists of outer joins and IN joins to see
d221 1
a221 1
creates an OuterJoinInfo node for each outer join, and join_is_legal
d224 1
a224 1
What we store in OuterJoinInfo nodes are the minimum sets of Relids
d275 1
a275 1
 pull up subqueries from rangetable, if possible
@


1.46
log
@Small wording improvements for source code READMEs.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.45 2008/04/09 00:59:24 momjian Exp $
d566 2
a567 2
path.  So, SortClause lists are turned into pathkeys lists for use inside
the optimizer.
@


1.45
log
@Revert README cleanups.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.43 2008/03/21 13:23:28 momjian Exp $
d76 2
a77 2
including sequential and index scans, and make a Path representing that
way.  All the Paths made for a given relation are placed in its
d274 1
a274 1
 do final cleanup after planning.
@


1.44
log
@Revert sentence removal from nickname in FAQ.
@
text
@d76 2
a77 2
including sequential and index scans, and make Paths representing those
ways.  All the Paths made for a given relation are placed in its
d274 1
a274 1
 do final cleanup after planning
@


1.43
log
@More README src cleanups.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.42 2008/03/20 17:55:14 momjian Exp $
d76 2
a77 2
including sequential and index scans, and make a Path representing that
way.  All the Paths made for a given relation are placed in its
d274 1
a274 1
 do final cleanup after planning.
@


1.42
log
@Make source code READMEs more consistent.  Add CVS tags to all README files.
@
text
@d1 1
a1 1
$PostgreSQL: pgsql/src/backend/access/gist/README,v 1.3 2005/09/16 14:40:54 teodor Exp $
d4 1
a4 1
---------
@


1.41
log
@Change have_join_order_restriction() so that we do not force a clauseless join
if either of the input relations can legally be joined to any other rels using
join clauses.  This avoids uselessly (and expensively) considering a lot of
really stupid join paths when there is a join restriction with a large
footprint, that is, lots of relations inside its LHS or RHS.  My patch of
15-Feb-2007 had been causing the code to consider joining *every* combination
of rels inside such a group, which is exponentially bad :-(.  With this
behavior, clauseless bushy joins will be done if necessary, but they'll be
put off as long as possible.  Per report from Jakub Ouhrabka.

Backpatch to 8.2.  We might someday want to backpatch to 8.1 as well, but 8.1
does not have the problem for OUTER JOIN nests, only for IN-clauses, so it's
not clear anyone's very likely to hit it in practice; and the current patch
doesn't apply cleanly to 8.1.
@
text
@d1 4
a4 2
Summary
-------
d185 1
a185 1
Valid OUTER JOIN optimizations
d241 1
a241 1
Pulling up subqueries
@


1.41.2.1
log
@Fix equivclass.c's not-quite-right strategy for handling X=X clauses.

The original coding correctly noted that these aren't just redundancies
(they're effectively X IS NOT NULL, assuming = is strict).  However, they
got treated that way if X happened to be in a single-member EquivalenceClass
already, which could happen if there was an ORDER BY X clause, for instance.
The simplest and most reliable solution seems to be to not try to process
such clauses through the EquivalenceClass machinery; just throw them back
for traditional processing.  The amount of work that'd be needed to be
smarter than that seems out of proportion to the benefit.

Per bug #5084 from Bernt Marius Johnsen, and analysis by Andrew Gierth.
@
text
@d470 1
a470 4
EquivalenceClass will never be merged with any other.  Note in particular
that a single-item EquivalenceClass {a.x} is *not* meant to imply an
assertion that a.x = a.x; the practical effect of this is that a.x could
be NULL.
@


1.40
log
@Create a function variable "join_search_hook" to let plugins override the
join search order portion of the planner; this is specifically intended to
simplify developing a replacement for GEQO planning.  Patch by Julius
Stroffek, editorialized on by me.  I renamed make_one_rel_by_joins to
standard_join_search and make_rels_by_joins to join_search_one_level to better
reflect their place within this scheme.
@
text
@d177 1
a177 1
make_join_rel consult side lists of outer joins and IN joins to see
d220 1
a220 1
creates an OuterJoinInfo node for each outer join, and make_join_rel
d229 1
a229 1
So the restriction enforced by make_join_rel is that a proposed join
@


1.39
log
@Get rid of some old and crufty global variables in the planner.  When
this code was last gone over, there wasn't really any alternative to
globals because we didn't have the PlannerInfo struct being passed all
through the planner code.  Now that we do, we can restructure things
to avoid non-reentrancy.  I'm fooling with this because otherwise I'd
have had to add another global variable for the planned compact
range table list.
@
text
@d295 1
a295 1
      find scan and all index paths for each base relation
d297 5
a301 4
-----make_one_rel_by_joins()
      jump to geqo if needed
      else call make_rels_by_joins() for each level of join tree needed
      make_rels_by_joins():
d305 1
a305 1
      Back at make_one_rel_by_joins(), apply set_cheapest() to extract the
@


1.38
log
@Restructure code that is responsible for ensuring that clauseless joins are
considered when it is necessary to do so because of a join-order restriction
(that is, an outer-join or IN-subselect construct).  The former coding was a
bit ad-hoc and inconsistent, and it missed some cases, as exposed by Mario
Weilguni's recent bug report.  His specific problem was that an IN could be
turned into a "clauseless" join due to constant-propagation removing the IN's
joinclause, and if the IN's subselect involved more than one relation and
there was more than one such IN linking to the same upper relation, then the
only valid join orders involve "bushy" plans but we would fail to consider the
specific paths needed to get there.  (See the example case added to the join
regression test.)  On examining the code I wonder if there weren't some other
problem cases too; in particular it seems that GEQO was defending against a
different set of corner cases than the main planner was.  There was also an
efficiency problem, in that when we did realize we needed a clauseless join
because of an IN, we'd consider clauseless joins against every other relation
whether this was sensible or not.  It seems a better design is to use the
outer-join and in-clause lists as a backup heuristic, just as the rule of
joining only where there are joinclauses is a heuristic: we'll join two
relations if they have a usable joinclause *or* this might be necessary to
satisfy an outer-join or IN-clause join order restriction.  I refactored the
code to have just one place considering this instead of three, and made sure
that it covered all the cases that any of them had been considering.

Backpatch as far as 8.1 (which has only the IN-clause form of the disease).
By rights 8.0 and 7.4 should have the bug too, but they accidentally fail
to fail, because the joininfo structure used in those releases preserves some
memory of there having once been a joinclause between the inner and outer
sides of an IN, and so it leads the code in the right direction anyway.
I'll be conservative and not touch them.
@
text
@d320 4
a323 1
PlannerInfo     - global information for planning a particular Query
@


1.37
log
@Repair bug in 8.2's new logic for planning outer joins: we have to allow joins
that overlap an outer join's min_righthand but aren't fully contained in it,
to support joining within the RHS after having performed an outer join that
can commute with this one.  Aside from the direct fix in make_join_rel(),
fix has_join_restriction() and GEQO's desirable_join() to consider this
possibility.  Per report from Ian Harding.
@
text
@d108 10
a117 6
Consider joining each RelOptInfo to each other RelOptInfo specified in its
RelOptInfo.joininfo, and generate a Path for each possible join method for
each such pair.  (If we have a RelOptInfo with no join clauses, we have no
choice but to generate a clauseless Cartesian-product join; so we consider
joining that rel to each other available rel.  But in the presence of join
clauses we will only consider joins that use available join clauses.)
@


1.36
log
@Refactor planner's pathkeys data structure to create a separate, explicit
representation of equivalence classes of variables.  This is an extensive
rewrite, but it brings a number of benefits:
* planner no longer fails in the presence of "incomplete" operator families
that don't offer operators for every possible combination of datatypes.
* avoid generating and then discarding redundant equality clauses.
* remove bogus assumption that derived equalities always use operators
named "=".
* mergejoins can work with a variety of sort orders (e.g., descending) now,
instead of tying each mergejoinable operator to exactly one sort order.
* better recognition of redundant sort columns.
* can make use of equalities appearing underneath an outer join.
@
text
@d226 2
a227 2
can't join across a RHS boundary (ie, join anything inside the RHS
to anything else) unless the join validly implements some outer join.
@


1.35
log
@Revise the planner's handling of "pseudoconstant" WHERE clauses, that is
clauses containing no variables and no volatile functions.  Such a clause
can be used as a one-time qual in a gating Result plan node, to suppress
plan execution entirely when it is false.  Even when the clause is true,
putting it in a gating node wins by avoiding repeated evaluation of the
clause.  In previous PG releases, query_planner() would do this for
pseudoconstant clauses appearing at the top level of the jointree, but
there was no ability to generate a gating Result deeper in the plan tree.
To fix it, get rid of the special case in query_planner(), and instead
process pseudoconstant clauses through the normal RestrictInfo qual
distribution mechanism.  When a pseudoconstant clause is found attached to
a path node in create_plan(), pull it out and generate a gating Result at
that point.  This requires special-casing pseudoconstants in selectivity
estimation and cost_qual_eval, but on the whole it's pretty clean.
It probably even makes the planner a bit faster than before for the normal
case of no pseudoconstants, since removing pull_constant_clauses saves one
useless traversal of the qual tree.  Per gripe from Phil Frost.
@
text
@d93 13
a105 15
both lists of relations to be joined in any order, and JOIN nodes that
force a particular join order.  For each un-flattened JOIN node, we join
exactly that pair of relations (after recursively planning their inputs,
if the inputs aren't single base relations).  We generate a Path for each
feasible join method, and select the cheapest Path.  Note that the JOIN
clause structure determines the join Path structure, but it doesn't
constrain the join implementation method at each join (nestloop, merge,
hash), nor does it say which rel is considered outer or inner at each
join.  We consider all these possibilities in building Paths.

3) At the top level of the FROM clause we will have a list of relations
that are either base rels or joinrels constructed per un-flattened JOIN
directives.  (This is also the situation, recursively, when we can flatten
sub-joins underneath an un-flattenable JOIN into a list of relations to
join.)  We can join these rels together in any order the planner sees fit.
d115 1
a115 1
If we only had two relations in the FROM list, we are done: we just pick
d118 1
a118 1
join RelOptInfos that represent more than two FROM items.
d122 2
a123 2
representing exactly two FROM items.  The second pass considers ways
to make join rels that represent exactly three FROM items; the next pass,
d125 1
a125 1
relation that includes all FROM items --- obviously there can be only one
d156 1
a156 1
but the inner is always a single FROM item); right-handed plans (outer rel
d337 3
a339 1
 PathKeys       - a data structure representing the ordering of a path
d366 108
d478 14
a491 5
of a particular Path.

Path.pathkeys is a List of Lists of PathKeyItem nodes that represent
the sort order of the result generated by the Path.  The n'th sublist
represents the n'th sort key of the result.
d497 12
a508 12
if any.  A single-key index would create a pathkey with a single sublist,
e.g. ( (tab1.indexkey1/sortop1) ).  A multi-key index generates a sublist
per key, e.g. ( (tab1.indexkey1/sortop1) (tab1.indexkey2/sortop2) ) which
shows major sort by indexkey1 (ordering by sortop1) and minor sort by
indexkey2 with sortop2.

Note that a multi-pass indexscan (OR clause scan) has NIL pathkeys since
we can say nothing about the overall order of its result.  Also, an
indexscan on an unordered type of index generates NIL pathkeys.  However,
we can always create a pathkey by doing an explicit sort.  The pathkeys
for a Sort plan's output just represent the sort key fields and the
ordering operators used.
d512 12
a523 29
of the mergejoin is sorted by X --- but it is also sorted by Y.  We
represent this fact by listing both keys in a single pathkey sublist:
( (A.X/xsortop B.Y/ysortop) ).  This pathkey asserts that the major
sort order of the Path can be taken to be *either* A.X or B.Y.
They are equal, so they are both primary sort keys.  By doing this,
we allow future joins to use either var as a pre-sorted key, so upper
Mergejoins may be able to avoid having to re-sort the Path.  This is
why pathkeys is a List of Lists.

We keep a sortop associated with each PathKeyItem because cross-data-type
mergejoins are possible; for example int4 = int8 is mergejoinable.
In this case we need to remember that the left var is ordered by int4lt
while the right var is ordered by int8lt.  So the different members of
each sublist could have different sortops.

Note that while the order of the top list is meaningful (primary vs.
secondary sort key), the order of each sublist is arbitrary.  Each sublist
should be regarded as a set of equivalent keys, with no significance
to the list order.

With a little further thought, it becomes apparent that pathkeys for
joins need not only come from mergejoins.  For example, if we do a
nestloop join between outer relation A and inner relation B, then any
pathkeys relevant to A are still valid for the join result: we have
not altered the order of the tuples from A.  Even more interesting,
if there was a mergeclause (more formally, an "equijoin clause") A.X=B.Y,
and A.X was a pathkey for the outer relation A, then we can assert that
B.Y is a pathkey for the join result; X was ordered before and still is,
and the joined values of Y are equal to the joined values of X, so Y
d525 28
a552 33
explicit sort nor a mergejoin on Y.

More generally, whenever we have an equijoin clause A.X = B.Y and a
pathkey A.X, we can add B.Y to that pathkey if B is part of the joined
relation the pathkey is for, *no matter how we formed the join*.  It works
as long as the clause has been applied at some point while forming the
join relation.  (In the current implementation, we always apply qual
clauses as soon as possible, ie, as far down in the plan tree as possible.
So we can treat the pathkeys as equivalent everywhere.  The exception is
when the relations A and B are joined inside the nullable side of an
OUTER JOIN and the equijoin clause comes from above the OUTER JOIN.  In this
case we cannot apply the qual as soon as A and B are joined, so we do not
consider the pathkeys to be equivalent.  This could be improved if we wanted
to go to the trouble of making pathkey equivalence be context-dependent,
but that seems much more complex than it's worth.)

In short, then: when producing the pathkeys for a merge or nestloop join,
we can keep all of the keys of the outer path, since the ordering of the
outer path will be preserved in the result.  Furthermore, we can add to
each pathkey sublist any inner vars that are equijoined to any of the
outer vars in the sublist; this works regardless of whether we are
implementing the join using that equijoin clause as a mergeclause,
or merely enforcing the clause after-the-fact as a qpqual filter.

Although Hashjoins also work only with equijoin operators, it is *not*
safe to consider the output of a Hashjoin to be sorted in any particular
order --- not even the outer path's order.  This is true because the
executor might have to split the join into multiple batches.  Therefore
a Hashjoin is always given NIL pathkeys.  (Also, we need to use only
mergejoinable operators when deducing which inner vars are now sorted,
because a mergejoin operator tells us which left- and right-datatype
sortops can be considered equivalent, whereas a hashjoin operator
doesn't imply anything about sort order.)
d559 49
a607 1
OK, now for how it *really* works:
a608 96
We did implement pathkeys just as described above, and found that the
planner spent a huge amount of time comparing pathkeys, because the
representation of pathkeys as unordered lists made it expensive to decide
whether two were equal or not.  So, we've modified the representation
as described next.

If we scan the WHERE clause for equijoin clauses (mergejoinable clauses)
during planner startup, we can construct lists of equivalent pathkey items
for the query.  There could be more than two items per equivalence set;
for example, WHERE A.X = B.Y AND B.Y = C.Z AND D.R = E.S creates the
equivalence sets { A.X B.Y C.Z } and { D.R E.S } (plus associated sortops).
Any pathkey item that belongs to an equivalence set implies that all the
other items in its set apply to the relation too, or at least all the ones
that are for fields present in the relation.  (Some of the items in the
set might be for as-yet-unjoined relations.)  Furthermore, any multi-item
pathkey sublist that appears at any stage of planning the query *must* be
a subset of one or another of these equivalence sets; there's no way we'd
have put two items in the same pathkey sublist unless they were equijoined
in WHERE.

Now suppose that we allow a pathkey sublist to contain pathkey items for
vars that are not yet part of the pathkey's relation.  This introduces
no logical difficulty, because such items can easily be seen to be
irrelevant; we just mandate that they be ignored.  But having allowed
this, we can declare (by fiat) that any multiple-item pathkey sublist
must be "equal()" to the appropriate equivalence set.  In effect,
whenever we make a pathkey sublist that mentions any var appearing in an
equivalence set, we instantly add all the other vars equivalenced to it,
whether they appear yet in the pathkey's relation or not.  And we also
mandate that the pathkey sublist appear in the same order as the
equivalence set it comes from.

In fact, we can go even further, and say that the canonical representation
of a pathkey sublist is a pointer directly to the relevant equivalence set,
which is kept in a list of pathkey equivalence sets for the query.  Then
pathkey sublist comparison reduces to pointer-equality checking!  To do this
we also have to add single-element pathkey sublists to the query's list of
equivalence sets, but that's a small price to pay.

By the way, it's OK and even useful for us to build equivalence sets
that mention multiple vars from the same relation.  For example, if
we have WHERE A.X = A.Y and we are scanning A using an index on X,
we can legitimately conclude that the path is sorted by Y as well;
and this could be handy if Y is the variable used in other join clauses
or ORDER BY.  So, any WHERE clause with a mergejoinable operator can
contribute to an equivalence set, even if it's not a join clause.

As sketched so far, equijoin operators allow us to conclude that
A.X = B.Y and B.Y = C.Z together imply A.X = C.Z, even when different
datatypes are involved.  What is not immediately obvious is that to use
the "canonical pathkey" representation, we *must* make this deduction.
An example (from a real bug in Postgres 7.0) is a mergejoin for a query
like
	SELECT * FROM t1, t2 WHERE t1.f2 = t2.f3 AND t1.f1 = t2.f3;
The canonical-pathkey mechanism is able to deduce that t1.f1 = t1.f2
(ie, both appear in the same canonical pathkey set).  If we sort t1
and then apply a mergejoin, we *must* filter the t1 tuples using the
implied qualification f1 = f2, because otherwise the output of the sort
will be ordered by f1 or f2 (whichever we sort on) but not both.  The
merge will then fail since (depending on which qual clause it applies
first) it's expecting either ORDER BY f1,f2 or ORDER BY f2,f1, but the
actual output of the sort has neither of these orderings.  The best fix
for this is to generate all the implied equality constraints for each
equijoin set and add these clauses to the query's qualification list.
In other words, we *explicitly* deduce f1 = f2 and add this to the WHERE
clause.  The constraint will be applied as a qpqual to the output of the
scan on t1, resulting in sort output that is indeed ordered by both vars.
This approach provides more information to the selectivity estimation
code than it would otherwise have, and reduces the number of tuples
processed in join stages, so it's a win to make these deductions even
if we weren't forced to.

When we generate implied equality constraints, we may find ourselves
adding redundant clauses to specific relations.  For example, consider
	SELECT * FROM t1, t2, t3 WHERE t1.a = t2.b AND t2.b = t3.c;
We will generate the implied clause t1.a = t3.c and add it to the tree.
This is good since it allows us to consider joining t1 and t3 directly,
which we otherwise wouldn't do.  But when we reach the stage of joining
all three relations, we will have redundant join clauses --- eg, if we
join t1 and t2 first, then the path that joins (t1 t2) to t3 will have
both t2.b = t3.c and t1.a = t3.c as restriction clauses.  This is bad;
not only is evaluation of the extra clause useless work at runtime,
but the selectivity estimator routines will underestimate the number
of tuples produced since they won't know that the two clauses are
perfectly redundant.  We fix this by detecting and removing redundant
clauses as the restriction clause list is built for each join.  (We
can't do it sooner, since which clauses are redundant will vary depending
on the join order.)

Yet another implication of all this is that mergejoinable operators
must form closed equivalence sets.  For example, if "int2 = int4"
and "int4 = int8" are both marked mergejoinable, then there had better
be a mergejoinable "int2 = int8" operator as well.  Otherwise, when
we're given WHERE int2var = int4var AND int4var = int8var, we'll fail
while trying to create a representation of the implied clause
int2var = int8var.
a609 10
An additional refinement we can make is to insist that canonical pathkey
lists (sort orderings) do not mention the same pathkey set more than once.
For example, a pathkey list ((A) (B) (A)) is redundant --- the second
occurrence of (A) does not change the ordering, since the data must already
be sorted by A.  Although a user probably wouldn't write ORDER BY A,B,A
directly, such redundancies are more probable once equijoin equivalences
have been considered.  Also, the system is likely to generate redundant
pathkey lists when computing the sort ordering needed for a mergejoin.  By
eliminating the redundancy, we save time and improve planning, since the
planner will more easily recognize equivalent orderings as being equivalent.
@


1.35.2.1
log
@Repair bug in 8.2's new logic for planning outer joins: we have to allow joins
that overlap an outer join's min_righthand but aren't fully contained in it,
to support joining within the RHS after having performed an outer join that
can commute with this one.  Aside from the direct fix in make_join_rel(),
fix has_join_restriction() and GEQO's desirable_join() to consider this
possibility.  Per report from Ian Harding.
@
text
@d228 2
a229 2
can't join a rel within or partly within an RHS boundary to one outside
the boundary, unless the join validly implements some outer join.
@


1.35.2.2
log
@Change have_join_order_restriction() so that we do not force a clauseless join
if either of the input relations can legally be joined to any other rels using
join clauses.  This avoids uselessly (and expensively) considering a lot of
really stupid join paths when there is a join restriction with a large
footprint, that is, lots of relations inside its LHS or RHS.  My patch of
15-Feb-2007 had been causing the code to consider joining *every* combination
of rels inside such a group, which is exponentially bad :-(.  With this
behavior, clauseless bushy joins will be done if necessary, but they'll be
put off as long as possible.  Per report from Jakub Ouhrabka.

Backpatch to 8.2.  We might someday want to backpatch to 8.1 as well, but 8.1
does not have the problem for OUTER JOIN nests, only for IN-clauses, so it's
not clear anyone's very likely to hit it in practice; and the current patch
doesn't apply cleanly to 8.1.
@
text
@d175 1
a175 1
join_is_legal consult side lists of outer joins and IN joins to see
d218 1
a218 1
creates an OuterJoinInfo node for each outer join, and join_is_legal
d227 1
a227 1
So the restriction enforced by join_is_legal is that a proposed join
@


1.34
log
@Teach planner how to rearrange join order for some classes of OUTER JOIN.
Per my recent proposal.  I ended up basing the implementation on the
existing mechanism for enforcing valid join orders of IN joins --- the
rules for valid outer-join orders are somewhat similar.
@
text
@d332 1
a332 1
  ResultPath    - a Result plan node (used for variable-free tlist or qual)
@


1.33
log
@Simplify the planner's join clause management by storing join clauses
of a relation in a flat 'joininfo' list.  The former arrangement grouped
the join clauses according to the set of unjoined relids used in each;
however, profiling on test cases involving lots of joins proves that
that data structure is a net loss.  It takes more time to group the
join clauses together than is saved by avoiding duplicate tests later.
It doesn't help any that there are usually not more than one or two
clauses per group ...
@
text
@d43 5
a47 4
scan, plus index scans for any indexes that exist on the table.  A subquery
base relation just has one Path, a "SubqueryScan" path (which links to the
subplan that was built by a recursive invocation of the planner).  Likewise
a function-RTE base relation has only one possible Path.
d88 14
a101 10
2) If the query's FROM clause contains explicit JOIN clauses, we join
those pairs of relations in exactly the tree structure indicated by the
JOIN clauses.  (This is absolutely necessary when dealing with outer JOINs.
For inner JOINs we have more flexibility in theory, but don't currently
exploit it in practice.)  For each such join pair, we generate a Path
for each feasible join method, and select the cheapest Path.  Note that
the JOIN clause structure determines the join Path structure, but it
doesn't constrain the join implementation method at each join (nestloop,
merge, hash), nor does it say which rel is considered outer or inner at
each join.  We consider all these possibilities in building Paths.
d104 4
a107 2
that are either base rels or joinrels constructed per JOIN directives.
We can join these rels together in any order the planner sees fit.
d166 1
a166 1
that produce equivalent joinrels will compete in add_path.
d172 64
d254 7
a260 7
FROM-list (with no explicit JOIN directives between), then we can merge the
two FROM-lists together.  Once that's done, the subquery is an absolutely
integral part of the outer query and will not constrain the join tree
search space at all.  However, that could result in unpleasant growth of
planning time, since the dynamic-programming search has runtime exponential
in the number of FROM-items considered.  Therefore, we don't merge
FROM-lists if the result would have too many FROM-items in one list.
@


1.32
log
@Remove planner's private fields from Query struct, and put them into
a new PlannerInfo struct, which is passed around instead of the bare
Query in all the planning code.  This commit is essentially just a
code-beautification exercise, but it does open the door to making
larger changes to the planner data structures without having to muck
with the widely-known Query struct.
@
text
@d78 4
a81 5
useful sort ordering of the relation.)  Also create RelOptInfo.joininfo
nodes that list all the join clauses that involve this relation.  For
example, the WHERE clause "tab1.col1 = tab2.col1" generates a JoinInfo
for tab1 listing tab2 as an unjoined relation, and also one for tab2
showing tab1 as an unjoined relation.
a253 1
 JoinInfo       - join clauses associated with a particular pair of relations
@


1.31
log
@Rethink original decision to use AND/OR Expr nodes to represent bitmap
logic operations during planning.  Seems cleaner to create two new Path
node types, instead --- this avoids duplication of cost-estimation code.
Also, create an enable_bitmapscan GUC parameter to control use of bitmap
plans.
@
text
@d248 2
@


1.30
log
@Instead of trying to force WHERE clauses into CNF or DNF normal form,
just look for common clauses that can be pulled out of ORs.  Per recent
discussion, extracting common clauses seems to be the only really useful
effect of normalization, and if we do it explicitly then we can avoid
cluttering the qual with partially-redundant duplicated expressions, which
was an unpleasant side-effect of the old approach.
@
text
@d258 1
@


1.29
log
@IN clauses appearing at top level of WHERE can now be handled as joins.
There are two implementation techniques: the executor understands a new
JOIN_IN jointype, which emits at most one matching row per left-hand row,
or the result of the IN's sub-select can be fed through a DISTINCT filter
and then joined as an ordinary relation.
Along the way, some minor code cleanup in the optimizer; notably, break
out most of the jointree-rearrangement preprocessing in planner.c and
put it in a new file prep/prepjointree.c.
@
text
@d203 4
a207 8
 canonicalize qual
     Attempt to reduce WHERE clause to either CNF or DNF canonical form.
     CNF (top-level-AND) is preferred, since the optimizer can then use
     any of the AND subclauses to filter tuples; but quals that are in
     or close to DNF form will suffer exponential expansion if we try to
     force them to CNF.  In pathological cases either transform may expand
     the qual unreasonably; so we may have to leave it un-normalized,
     thereby reducing the accuracy of selectivity estimates.
@


1.28
log
@Allow merge and hash joins to occur on arbitrary expressions (anything not
containing a volatile function), rather than only on 'Var = Var' clauses
as before.  This makes it practical to do flatten_join_alias_vars at the
start of planning, which in turn eliminates a bunch of klugery inside the
planner to deal with alias vars.  As a free side effect, we now detect
implied equality of non-Var expressions; for example in
	SELECT ... WHERE a.x = b.y and b.y = 42
we will deduce a.x = 42 and use that as a restriction qual on a.  Also,
we can remove the restriction introduced 12/5/02 to prevent pullup of
subqueries whose targetlists contain sublinks.
Still TODO: make statistical estimation routines in selfuncs.c and costsize.c
smarter about expressions that are more complex than plain Vars.  The need
for this is considerably greater now that we have to be able to estimate
the suitability of merge and hash join techniques on such expressions.
@
text
@d266 1
@


1.27
log
@Be more realistic about plans involving Materialize nodes: take their
cost into account while planning.
@
text
@d254 4
a257 2
 RestrictInfo   - restriction clauses, like "x = 3"
 JoinInfo       - join clauses, including the relids needed for the join
@


1.26
log
@First phase of implementing hash-based grouping/aggregation.  An AGG plan
node now does its own grouping of the input rows, and has no need for a
preceding GROUP node in the plan pipeline.  This allows elimination of
the misnamed tuplePerGroup option for GROUP, and actually saves more code
in nodeGroup.c than it costs in nodeAgg.c, as well as being presumably
faster.  Restructure the API of query_planner so that we do not commit to
using a sorted or unsorted plan in query_planner; instead grouping_planner
makes the decision.  (Right now it isn't any smarter than query_planner
was, but that will change as soon as it has the option to select a hash-
based aggregation step.)  Despite all the hackery, no initdb needed since
only in-memory node types changed.
@
text
@d262 2
a263 1
  ResultPath    - a Result plan (used for variable-free tlist or qual)
@


1.25
log
@Add Bob Devine's name to the optimizer README.
@
text
@d222 3
a224 5
   make a simplified target list that only contains Vars, no expressions
---subplanner()
    make list of base relations used in query
    split up the qual into restrictions (a=1) and joins (b=c)
    find qual clauses that enable merge and hash joins
d240 1
a240 1
   put back constant quals and non-simplified target list
d258 1
a258 1
  SeqScan       - a plain Path node with nodeTag = T_SeqScan
d260 3
d280 1
a280 1
sort ordering is of interest even at the top level.  subplanner() will
d282 2
a283 2
and will compare its cost to the cost of using the cheapest-overall path
and doing an explicit sort.
@


1.24
log
@Get rid of long-since-vestigial Iter node type, in favor of adding a
returns-set boolean field in Func and Oper nodes.  This allows cleaner,
more reliable tests for expressions returning sets in the planner and
parser.  For example, a WHERE clause returning a set is now detected
and complained of in the parser, not only at runtime.
@
text
@d501 4
d506 1
@


1.23
log
@Extend code that deduces implied equality clauses to detect whether a
clause being added to a particular restriction-clause list is redundant
with those already in the list.  This avoids useless work at runtime,
and (perhaps more importantly) keeps the selectivity estimation routines
from generating too-small estimates of numbers of output rows.
Also some minor improvements in OPTIMIZER_DEBUG displays.
@
text
@d45 2
a46 1
subplan that was built by a recursive invocation of the planner).
@


1.22
log
@Move structure comments from the top block down to the line entries for
this file to match all the other files, and to be clearer.
@
text
@d464 17
@


1.21
log
@Planner speedup hacking.  Avoid saving useless pathkeys, so that path
comparison does not consider paths different when they differ only in
uninteresting aspects of sort order.  (We had a special case of this
consideration for indexscans already, but generalize it to apply to
ordered join paths too.)  Be stricter about what is a canonical pathkey
to allow faster pathkey comparison.  Cache canonical pathkeys and
dispersion stats for left and right sides of a RestrictInfo's clause,
to avoid repeated computation.  Total speedup will depend on number of
tables in a query, but I see about 4x speedup of planning phase for
a sample seven-table query.
@
text
@d83 1
a83 1
If we have only a single base relation in the query, we are done now.
@


1.20
log
@Restructure handling of inheritance queries so that they work with outer
joins, and clean things up a good deal at the same time.  Append plan node
no longer hacks on rangetable at runtime --- instead, all child tables are
given their own RT entries during planning.  Concept of multiple target
tables pushed up into execMain, replacing bug-prone implementation within
nodeAppend.  Planner now supports generating Append plans for inheritance
sets either at the top of the plan (the old way) or at the bottom.  Expanding
at the bottom is appropriate for tables used as sources, since they may
appear inside an outer join; but we must still expand at the top when the
target of an UPDATE or DELETE is an inheritance set, because we actually need
a different targetlist and junkfilter for each target table in that case.
Fortunately a target table can't be inside an outer join...  Bizarre mutual
recursion between union_planner and prepunion.c is gone --- in fact,
union_planner doesn't really have much to do with union queries anymore,
so I renamed it grouping_planner.
@
text
@d359 7
a365 3
So we can always make this deduction.  If we postponed filtering by qual
clauses then we'd not be able to assume pathkey equivalence until after
the equality check(s) had been applied.)
d422 8
a429 12
equivalence set it comes from.  (In practice, we simply return a pointer
to the relevant equivalence set without building any new sublist at all.
Each equivalence set becomes a "canonical pathkey" for all its members.)
This makes comparing pathkeys very simple and fast, and saves a lot of
work and memory space for pathkey construction as well.

Note that pathkey sublists having just one item still exist, and are
not expected to be equal() to any equivalence set.  This occurs when
we describe a sort order that involves a var that's not mentioned in
any equijoin clause of the WHERE.  We could add singleton sets containing
such vars to the query's list of equivalence sets, but there's little
point in doing so.
d471 11
@


1.19
log
@Subselects in FROM clause, per ISO syntax: FROM (SELECT ...) [AS] alias.
(Don't forget that an alias is required.)  Views reimplemented as expanding
to subselect-in-FROM.  Grouping, aggregates, DISTINCT in views actually
work now (he says optimistically).  No UNION support in subselects/views
yet, but I have some ideas about that.  Rule-related permissions checking
moved out of rewriter and into executor.
INITDB REQUIRED!
@
text
@d7 5
a11 5
tables, and /prep handles special cases like inheritance.  /util is utility
stuff.  /geqo is the separate "genetic optimization" planner --- it does
a semi-random search through the join tree space, rather than exhaustively
considering all possible join trees.  (But each join considered by /geqo
is given to /path to create paths for, so we consider all possible
d213 4
a216 4
--union_planner()
  handle unions and inheritance by mutual recursion with prepunion.c routines
  preprocess target list
  handle GROUP BY, HAVING, aggregates, ORDER BY, DISTINCT
d242 1
a242 1
 Back at union_planner:
d247 1
@


1.18
log
@First cut at full support for OUTER JOINs.  There are still a few loose
ends to clean up (see my message of same date to pghackers), but mostly
it works.  INITDB REQUIRED!
@
text
@d12 49
a60 1
implementation paths for each specific join even in GEQO mode.)
d67 2
a68 10
exhaustive search through the ways of executing the query.  During
the planning/optimizing process, we build "Path" trees representing
the different ways of doing a query.  We select the cheapest Path
that generates the desired relation and turn it into a Plan to pass
to the executor.  (There is pretty much a one-to-one correspondence
between the Path and Plan trees, but Path nodes omit info that won't
be needed during planning, and include info needed for planning that
won't be needed by the executor.)

The best Path tree is found by a recursive process:
d87 22
a108 13
2) Consider joining each RelOptInfo to each other RelOptInfo specified in
its RelOptInfo.joininfo, and generate a Path for each possible join method.
(If we have a RelOptInfo with no join clauses, we have no choice but to
generate a clauseless Cartesian-product join; so we consider joining that
rel to each other available rel.  But in the presence of join clauses we
will only consider joins that use available join clauses.)

At this stage each input RelOptInfo is a single relation, so we are joining
every relation to the other relations as joined in the WHERE clause.  We
generate a new "join" RelOptInfo for each possible combination of two
"base" RelOptInfos, and put all the plausible paths for that combination
into the join RelOptInfo's pathlist.  (As before, we keep only the cheapest
alternative that generates any one sort ordering of the result.)
d110 2
a111 17
Joins always occur using two RelOptInfos.  One is outer, the other inner.
Outers drive lookups of values in the inner.  In a nested loop, lookups of
values in the inner occur by scanning the inner path once per outer tuple
to find each matching inner row.  In a mergejoin, inner and outer rows are
ordered, and are accessed in order, so only one scan is required to perform
the entire join: both inner and outer paths are scanned in-sync.  (There's
not a lot of difference between inner and outer in a mergejoin...)  In a
hashjoin, the inner is scanned first and all its rows are entered in a
hashtable, then the outer is scanned and for each row we look up the join
key in the hashtable.

A Path for a join relation is actually a tree structure, with the top
Path node representing the join method.  It has left and right subpaths
that represent the scan methods used for the two input relations.

3) If we only had two base relations, we are done: we just pick the
cheapest path for the join RelOptInfo.  If we had more than two, we now
d113 1
a113 1
join RelOptInfos that represent more than two base relations.
d117 4
a120 4
representing exactly two base relations.  The second pass considers ways
to make join rels that represent exactly three base relations; the next pass,
four relations, etc.  The last pass considers how to make the final join
relation that includes all base rels --- obviously there can be only one
d151 9
a159 9
but the inner is always a base rel); right-handed plans (outer rel is always
a base rel); and bushy plans (both inner and outer can be joins themselves).
For example, when building {1 2 3 4} we consider joining {1 2 3} to {4}
(left-handed), {4} to {1 2 3} (right-handed), and {1 2} to {3 4} (bushy),
among other choices.  Although the jointree scanning code produces these
potential join combinations one at a time, all the ways to produce the
same set of joined base rels will share the same RelOptInfo, so the paths
produced from different join combinations that produce equivalent joinrels
will compete in add_path.
d165 25
a189 12
The above dynamic-programming search is only conducted for simple cross
joins (ie, SELECT FROM tab1, tab2, ...).  When the FROM clause contains
explicit JOIN clauses, we join rels in exactly the order implied by the
join tree.  Searching for the best possible join order is done only at
the top implicit-cross-join level.  For example, in
	SELECT FROM tab1, tab2, (tab3 NATURAL JOIN tab4)
we will always join tab3 to tab4 and then consider all ways to join that
result to tab1 and tab2.  Note that the JOIN syntax only constrains the
order of joining --- we will still consider all available Paths and
join methods for each JOIN operator.  We also consider both sides of
the JOIN operator as inner or outer (so that we can transform RIGHT JOIN
into LEFT JOIN).
d201 1
d218 4
a221 3
   pull out constants from target list
   get a target list that only contains column names, no expressions
   if none, then return
d225 1
a225 1
    find relation clauses that can do merge sort and hash joins
d240 5
a244 4
   do group(GROUP)
   do aggregate
   put back constants
   re-flatten target list
@


1.17
log
@Deduce equality constraints that are implied by transitivity of
mergejoinable qual clauses, and add them to the query quals.  For
example, WHERE a = b AND b = c will cause us to add AND a = c.
This is necessary to ensure that it's safe to use these variables
as interchangeable sort keys, which is something 7.0 knows how to do.
Should provide a useful improvement in planning ability, too.
@
text
@d38 4
a41 4
nodes that list all the joins that involve this relation.  For example,
the WHERE clause "tab1.col1 = tab2.col1" generates a JoinInfo for tab1
listing tab2 as an unjoined relation, and also one for tab2 showing tab1
as an unjoined relation.
d131 13
d174 1
a174 2
    make list of relations in target
    make list of relations in where clause
d176 1
a176 1
    find relation clauses that can do merge sort and hash joins
d179 1
a179 1
      find scan and all index paths for each relation
@


1.16
log
@Restructure planning code so that preprocessing of targetlist and quals
to simplify constant expressions and expand SubLink nodes into SubPlans
is done in a separate routine subquery_planner() that calls union_planner().
We formerly did most of this work in query_planner(), but that's the
wrong place because it may never see the real targetlist.  Splitting
union_planner into two routines also allows us to avoid redundant work
when union_planner is invoked recursively for UNION and inheritance
cases.  Upshot is that it is now possible to do something like
select float8(count(*)) / (select count(*) from int4_tbl)  from int4_tbl
group by f1;
which has never worked before.
@
text
@d10 1
a10 1
considering all possible join trees.  (But each join considered by geqo
d43 1
a43 1
If we have only a single base relation in the query, we are done here.
d228 182
a409 2
See path/pathkeys.c for an explanation of the PathKeys data structure that
represents what is known about the sort order of a particular Path.
@


1.15
log
@New cost model for planning, incorporating a penalty for random page
accesses versus sequential accesses, a (very crude) estimate of the
effects of caching on random page accesses, and cost to evaluate WHERE-
clause expressions.  Export critical parameters for this model as SET
variables.  Also, create SET variables for the planner's enable flags
(enable_seqscan, enable_indexscan, etc) so that these can be controlled
more conveniently than via PGOPTIONS.

Planner now estimates both startup cost (cost before retrieving
first tuple) and total cost of each path, so it can optimize queries
with LIMIT on a reasonable basis by interpolating between these costs.
Same facility is a win for EXISTS(...) subqueries and some other cases.

Redesign pathkey representation to achieve a major speedup in planning
(I saw as much as 5X on a 10-way join); also minor changes in planner
to reduce memory consumption by recycling discarded Path nodes and
not constructing unnecessary lists.

Minor cleanups to display more-plausible costs in some cases in
EXPLAIN output.

Initdb forced by change in interface to index cost estimation
functions.
@
text
@d135 2
d138 5
a142 7
 handle inheritance by processing separately
-init_query_planner()
  preprocess target list
  preprocess qualifications(WHERE)
--query_planner()
   simplify constant subexpressions
   canonicalize_qual()
d150 7
@


1.14
log
@Repair planning bugs caused by my misguided removal of restrictinfo link
fields in JoinPaths --- turns out that we do need that after all :-(.
Also, rearrange planner so that only one RelOptInfo is created for a
particular set of joined base relations, no matter how many different
subsets of relations it can be created from.  This saves memory and
processing time compared to the old method of making a bunch of RelOptInfos
and then removing the duplicates.  Clean up the jointree iteration logic;
not sure if it's better, but I sure find it more readable and plausible
now, particularly for the case of 'bushy plans'.
@
text
@d125 1
a125 1
will compete in add_pathlist.
@


1.13
log
@Major planner/optimizer revision: get rid of PathOrder node type,
store all ordering information in pathkeys lists (which are now lists of
lists of PathKeyItem nodes, not just lists of lists of vars).  This was
a big win --- the code is smaller and IMHO more understandable than it
was, even though it handles more cases.  I believe the node changes will
not force an initdb for anyone; planner nodes don't show up in stored
rules.
@
text
@d4 14
d43 4
d49 5
d79 11
a89 3
join RelOptInfos that represent more than two base relations.  This process
is repeated until we have finally built a RelOptInfo that represents all
the base relations in the query.  Then we pick its cheapest Path.
d103 1
d113 1
a113 1
    {1 2 3},{1 3 4},{1 2 4}
d116 15
a130 6
In the default left-handed joins, each RelOptInfo adds one
single-relation RelOptInfo in each join pass, and the added RelOptInfo
is always the inner relation in the join.  In right-handed joins, the
added RelOptInfo is the outer relation in the join.  In bushy plans,
multi-relation RelOptInfo's can be joined to other multi-relation
RelOptInfo's. 
a134 8
These directories take the Query structure returned by the parser, and
generate a plan used by the executor.  The /plan directory generates the
actual output plan, the /path code generates all possible ways to join the
tables, and /prep handles special cases like inheritance.  /util is utility
stuff.  /geqo is the separate "genetic optimization" planner --- it does
a semi-random search rather than exhaustively considering all possible
join trees.

d141 9
a149 23
   cnfify()
    Summary:

     Simple cases with all AND's are handled by removing the AND's:

     convert:   a = 1 AND b = 2 AND c = 3
     to:        a = 1, b = 2, c = 3

     Qualifications with OR's are handled differently.  OR's inside AND
     clauses are not modified drastically:

     convert:   a = 1 AND b = 2 AND (c = 3 OR d = 4)
     to:        a = 1, b = 2, c = 3 OR d = 4

     OR's in the upper level are more complex to handle:

     convert:   (a = 1 AND b = 2) OR c = 3
     to:        (a = 1 OR c = 3) AND (b = 2 OR c = 3)
     finally:   (a = 1 OR c = 3), (b = 2 OR c = 3)

     These clauses all have to be true for a result to be returned,
     so the optimizer can choose the most restrictive clauses.

d164 8
a171 14
      again:
       make_rels_by_joins():
        for each joinrel:
         make_rels_by_clause_joins()
          for each rel's joininfo list:
           if a join from the join clause adds only one relation, do the join
         or make_rels_by_clauseless_joins()
       update_rels_pathlist_for_joins()
        generate nested,merge,hash join paths for new rel's created above
       merge_rels_with_same_relids()
        merge RelOptInfo paths that have the same relids because of joins
       rels_set_cheapest()
        set cheapest path
       if all relations in one RelOptInfo, return
a179 1

d189 1
a189 1
  SeqScan	- a plain Path node with nodeTag = T_SeqScan
d195 1
a195 1
 PathKeys	- a data structure representing the ordering of a path
d207 5
a211 8
ORDER BY clause if the final path has the right ordering already.
Currently, this is not very well implemented: we avoid generating a
redundant sort if the chosen path has the desired order, but we do not do
anything to encourage the selection of such a path --- so we only avoid the
sort if the path that would be chosen anyway (because it is cheapest
without regard to its ordering) is properly sorted.  The path winnowing
process needs to be aware of the desired output order and account for the
cost of doing an explicit sort while it is choosing the best path.
d216 4
a219 3
achieve a given sort order.  In this way, the next level up will have the
maximum freedom to build mergejoins without sorting, since it can pick from
any of the paths retained for its inputs.
@


1.12
log
@optimizer cleanup
@
text
@d4 55
a58 1
The optimizer generates optimal query plans by doing several steps:
d60 1
a60 29
1) Take each relation in a query, and make a RelOptInfo structure for
it.  Find each way of accessing the relation, called a Path, including
sequential and index scans, and add it to RelOptInfo.pathlist.  Also
create RelOptInfo.joininfo that lists all the joins that involve this
relation.  For example, the WHERE clause "tab1.col1 = tab2.col1"
generates a JoinInfo for tab1 listing tab2 as an unjoined relation, and
tab2's joininfo shows tab1 as an unjoined relation.

2) Join each RelOptInfo to other RelOptInfo as specified in
RelOptInfo.joininfo.  At this point each RelOptInfo is a single
relation, so you are joining every relation to the other relations as
joined in the WHERE clause.

Joins occur using two RelOptInfos.  One is outer, the other inner. 
Outers drive lookups of values in the inner.  In a nested loop, lookups
of values in the inner occur by scanning to find each matching inner
row.  In a mergejoin, inner and outer rows are ordered, and are accessed
in order, so only one scan of inner is required to perform the entire
join.  In a hashjoin, inner rows are hashed for lookups.

Each unique join combination becomes a new RelOptInfo.  The RelOptInfo
is now the joining of two relations.  RelOptInfo.pathlist are various
paths to create the joined result, having different orderings depending
on the join method used.

3) At this point, every RelOptInfo is joined to each other again, with
a new relation added to each RelOptInfo.  This continues until all
relations have been joined into one RelOptInfo, and the cheapest Path is
chosen.
d96 5
a100 2
plan, the /path code generates all possible ways to join the tables, and
/prep handles special cases like inheritance.  /util is utility stuff.
d168 2
a169 2
Optimizer Structures
--------------------
d177 1
d179 1
a179 1
  NestPath      - nested joins
d183 30
a212 1
 PathOrder      - every ordering type (sort, merge of relations)
@


1.11
log
@optimizer cleanup
@
text
@d10 3
a12 1
relation.
@


1.10
log
@Update optimizer readme.
@
text
@d8 8
a15 6
sequential and index scans, and add it to RelOptInfo.pathlist.

2) Join each RelOptInfo to each other RelOptInfo as specified in the
WHERE clause.  At this point each RelOptInfo is a single relation, so
you are joining every relation to every relation as joined in the WHERE
clause.
d142 2
a143 2
 RestrictInfo   - restriction clauses
 JoinInfo       - join clauses
@


1.9
log
@Major optimizer improvement for joining a large number of tables.
@
text
@d6 3
a8 4
1) Take each relation in a query, and make a RelOptInfo structure for it. 
Find each way of accessing the relation, called a Path, including
sequential and index scans, and add it to the RelOptInfo.path_order
list.
d12 2
a13 2
you are joining every relation to every relation it is joined to in the
WHERE clause.
d18 3
a20 3
row.  In a mergejoin, inner rows are ordered, and are accessed in order,
so only one scan of inner is required to perform the entire join.  In a
hashjoin, inner rows are hashed for lookups.
d23 1
a23 1
is now the joining of two relations.  RelOptInfo.path_order are various
d32 28
a59 21
	SELECT 	*
	FROM 	tab1, tab2, tab3, tab4
	WHERE 	tab1.col = tab2.col AND
		tab2.col = tab3.col AND
		tab3.col = tab4.col

	Tables 1, 2, 3, and 4 are joined as:
	{1 2},{2 3},{3 4}
	{1 2 3},{2 3 4}
	{1 2 3 4}

	SELECT 	*
	FROM 	tab1, tab2, tab3, tab4
	WHERE 	tab1.col = tab2.col AND
		tab1.col = tab3.col AND
		tab1.col = tab4.col

	Tables 1, 2, 3, and 4 are joined as:
	{1 2},{1 3},{1 4}
	{1 2 3},{1 3 4},{1 2 4}
	{1 2 3 4}
d104 7
a110 7
     split up the qual into restrictions (a=1) and joins (b=c)
    find which relations can do merge sort and hash joins
----find_paths()
     find scan and all index paths for each relation not yet joined
     one relation, return
     find selectivity of columns used in joins
-----find_join_paths()
d113 1
a113 1
       find_join_rels():
d115 2
a116 2
         find_clause_joins()
          for each join on joinrel:
d118 8
a125 8
         or find_clauseless_joins()
       find_all_join_paths()
        generate paths(nested,sortmerge) for joins found in find_join_rels()
       prune_joinrels()
        remove from the join list the relation we just added to each join
       prune_rel_paths()
        set cheapest and perhaps remove unordered path, recompute table sizes
       if we have not done all the tables, go to again:
d138 1
a138 1
RelOptInfo		- Every relation
d140 2
a141 2
 RestrictInfo	- restriction clauses
 JoinInfo		- join combinations
d143 5
a147 8
 Path			- every way to access a relation(sequential, index)
  IndexPath		- index scans

  JoinPath		- joins
   MergePath	- merge joins
   HashPath		- hash joins

 PathOrder		- every ordering type (sort, merge of relations)
d149 1
@


1.8
log
@Optimizer cleanup.
@
text
@d1 3
d6 1
a6 1
Take each relation in a query, and make a RelOptInfo structure for it. 
d11 43
a104 22
      Summary:  With OPTIMIZER_DEBUG defined, you see:

      Tables 1, 2, 3, and 4 are joined as:
         {1 2},{1 3},{1 4},{2 3},{2 4}
         {1 2 3},{1 2 4},{2 3 4}
         {1 2 3 4}

      Actual output tests show combinations:
         {4 2},{3 2},{1 4},{1 3},{1 2}
         {4 2 3},{1 4 2},{1 3 2}
         {4 2 3 1}

      Cheapest join order shows:
         {4 2},{3 2},{1 4},{1 3},{1 2}
         {3 2 4},{1 4 2},{1 3 2}
         {1 4 2 3}

      It first finds the best way to join each table to every other
      table.  It then takes those joined table combinations, and joins
      them to the other joined table combinations, until all tables are
      joined.

@


1.7
log
@More optimizer cleanups.
@
text
@d1 8
@


1.6
log
@More optimizer renaming HInfo -> HashInfo.
@
text
@d1 3
d99 14
a112 5
RelOptInfo		- info about every relation
  RestrictInfo	- info about restrictions
  JoinInfo		- info about join combinations
  Path			- info about every way to access a relation(sequential, index)
    PathOrder 	- info about every ordering (sort, merge of relations)
@


1.5
log
@Optimizer rename ClauseInfo -> RestrictInfo.  Update optimizer README.
@
text
@d96 6
a101 5
Path		- info about every way to access a relation(sequential, index)
  PathOrder - info about every ordering (sort, merge of relations)
RelOptInfo	- info about every relation 
  JoinInfo	- info about join combinations
RestrictInfo	- info about restrictions
@


1.4
log
@Cleanup optimizer function names and clarify code.
@
text
@d1 1
a1 1
These directories take the Query structure returned by the parser, and
d84 1
a84 1
       if we have not done all the tables, go to "again"
d91 10
@


1.3
log
@OPTIMIZER_DEBUG additions.
@
text
@d12 23
a34 1
   cnfify the qualification, so quals become a list of expressions (formerly ANDed) and OR clauses
d41 1
@


1.2
log
@Cost cleanup.
@
text
@d12 1
@


1.1
log
@Add optimizer README file.
@
text
@d55 1
a55 1
        generate paths(nested,mergesort) for joins found in find_join_rels()
@
