Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * cluster.c
4 : * CLUSTER a table on an index. This is now also used for VACUUM FULL.
5 : *
6 : * There is hardly anything left of Paul Brown's original implementation...
7 : *
8 : *
9 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994-5, Regents of the University of California
11 : *
12 : *
13 : * IDENTIFICATION
14 : * src/backend/commands/cluster.c
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 :
20 : #include "access/amapi.h"
21 : #include "access/heapam.h"
22 : #include "access/multixact.h"
23 : #include "access/relscan.h"
24 : #include "access/tableam.h"
25 : #include "access/toast_internals.h"
26 : #include "access/transam.h"
27 : #include "access/xact.h"
28 : #include "access/xlog.h"
29 : #include "catalog/catalog.h"
30 : #include "catalog/dependency.h"
31 : #include "catalog/heap.h"
32 : #include "catalog/index.h"
33 : #include "catalog/namespace.h"
34 : #include "catalog/objectaccess.h"
35 : #include "catalog/partition.h"
36 : #include "catalog/pg_am.h"
37 : #include "catalog/pg_database.h"
38 : #include "catalog/pg_inherits.h"
39 : #include "catalog/toasting.h"
40 : #include "commands/cluster.h"
41 : #include "commands/defrem.h"
42 : #include "commands/progress.h"
43 : #include "commands/tablecmds.h"
44 : #include "commands/vacuum.h"
45 : #include "miscadmin.h"
46 : #include "optimizer/optimizer.h"
47 : #include "pgstat.h"
48 : #include "storage/bufmgr.h"
49 : #include "storage/lmgr.h"
50 : #include "storage/predicate.h"
51 : #include "utils/acl.h"
52 : #include "utils/fmgroids.h"
53 : #include "utils/guc.h"
54 : #include "utils/inval.h"
55 : #include "utils/lsyscache.h"
56 : #include "utils/memutils.h"
57 : #include "utils/pg_rusage.h"
58 : #include "utils/relmapper.h"
59 : #include "utils/snapmgr.h"
60 : #include "utils/syscache.h"
61 : #include "utils/tuplesort.h"
62 :
63 : /*
64 : * This struct is used to pass around the information on tables to be
65 : * clustered. We need this so we can make a list of them when invoked without
66 : * a specific table/index pair.
67 : */
68 : typedef struct
69 : {
70 : Oid tableOid;
71 : Oid indexOid;
72 : } RelToCluster;
73 :
74 :
75 : static void cluster_multiple_rels(List *rtcs, ClusterParams *params);
76 : static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
77 : static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
78 : bool verbose, bool *pSwapToastByContent,
79 : TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
80 : static List *get_tables_to_cluster(MemoryContext cluster_context);
81 : static List *get_tables_to_cluster_partitioned(MemoryContext cluster_context,
82 : Oid indexOid);
83 : static bool cluster_is_permitted_for_relation(Oid relid, Oid userid);
84 :
85 :
86 : /*---------------------------------------------------------------------------
87 : * This cluster code allows for clustering multiple tables at once. Because
88 : * of this, we cannot just run everything on a single transaction, or we
89 : * would be forced to acquire exclusive locks on all the tables being
90 : * clustered, simultaneously --- very likely leading to deadlock.
91 : *
92 : * To solve this we follow a similar strategy to VACUUM code,
93 : * clustering each relation in a separate transaction. For this to work,
94 : * we need to:
95 : * - provide a separate memory context so that we can pass information in
96 : * a way that survives across transactions
97 : * - start a new transaction every time a new relation is clustered
98 : * - check for validity of the information on to-be-clustered relations,
99 : * as someone might have deleted a relation behind our back, or
100 : * clustered one on a different index
101 : * - end the transaction
102 : *
103 : * The single-relation case does not have any such overhead.
104 : *
105 : * We also allow a relation to be specified without index. In that case,
106 : * the indisclustered bit will be looked up, and an ERROR will be thrown
107 : * if there is no index with the bit set.
108 : *---------------------------------------------------------------------------
109 : */
110 : void
111 218 : cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
112 : {
113 : ListCell *lc;
114 218 : ClusterParams params = {0};
115 218 : bool verbose = false;
116 218 : Relation rel = NULL;
117 218 : Oid indexOid = InvalidOid;
118 : MemoryContext cluster_context;
119 : List *rtcs;
120 :
121 : /* Parse option list */
122 234 : foreach(lc, stmt->params)
123 : {
124 16 : DefElem *opt = (DefElem *) lfirst(lc);
125 :
126 16 : if (strcmp(opt->defname, "verbose") == 0)
127 16 : verbose = defGetBoolean(opt);
128 : else
129 0 : ereport(ERROR,
130 : (errcode(ERRCODE_SYNTAX_ERROR),
131 : errmsg("unrecognized CLUSTER option \"%s\"",
132 : opt->defname),
133 : parser_errposition(pstate, opt->location)));
134 : }
135 :
136 218 : params.options = (verbose ? CLUOPT_VERBOSE : 0);
137 :
138 218 : if (stmt->relation != NULL)
139 : {
140 : /* This is the single-relation case. */
141 : Oid tableOid;
142 :
143 : /*
144 : * Find, lock, and check permissions on the table. We obtain
145 : * AccessExclusiveLock right away to avoid lock-upgrade hazard in the
146 : * single-transaction case.
147 : */
148 190 : tableOid = RangeVarGetRelidExtended(stmt->relation,
149 : AccessExclusiveLock,
150 : 0,
151 : RangeVarCallbackMaintainsTable,
152 : NULL);
153 184 : rel = table_open(tableOid, NoLock);
154 :
155 : /*
156 : * Reject clustering a remote temp table ... their local buffer
157 : * manager is not going to cope.
158 : */
159 184 : if (RELATION_IS_OTHER_TEMP(rel))
160 0 : ereport(ERROR,
161 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
162 : errmsg("cannot cluster temporary tables of other sessions")));
163 :
164 184 : if (stmt->indexname == NULL)
165 : {
166 : ListCell *index;
167 :
168 : /* We need to find the index that has indisclustered set. */
169 44 : foreach(index, RelationGetIndexList(rel))
170 : {
171 32 : indexOid = lfirst_oid(index);
172 32 : if (get_index_isclustered(indexOid))
173 20 : break;
174 12 : indexOid = InvalidOid;
175 : }
176 :
177 32 : if (!OidIsValid(indexOid))
178 12 : ereport(ERROR,
179 : (errcode(ERRCODE_UNDEFINED_OBJECT),
180 : errmsg("there is no previously clustered index for table \"%s\"",
181 : stmt->relation->relname)));
182 : }
183 : else
184 : {
185 : /*
186 : * The index is expected to be in the same namespace as the
187 : * relation.
188 : */
189 152 : indexOid = get_relname_relid(stmt->indexname,
190 152 : rel->rd_rel->relnamespace);
191 152 : if (!OidIsValid(indexOid))
192 0 : ereport(ERROR,
193 : (errcode(ERRCODE_UNDEFINED_OBJECT),
194 : errmsg("index \"%s\" for table \"%s\" does not exist",
195 : stmt->indexname, stmt->relation->relname)));
196 : }
197 :
198 172 : if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
199 : {
200 : /* close relation, keep lock till commit */
201 146 : table_close(rel, NoLock);
202 :
203 : /* Do the job. */
204 146 : cluster_rel(tableOid, indexOid, ¶ms);
205 :
206 146 : return;
207 : }
208 : }
209 :
210 : /*
211 : * By here, we know we are in a multi-table situation. In order to avoid
212 : * holding locks for too long, we want to process each table in its own
213 : * transaction. This forces us to disallow running inside a user
214 : * transaction block.
215 : */
216 54 : PreventInTransactionBlock(isTopLevel, "CLUSTER");
217 :
218 : /* Also, we need a memory context to hold our list of relations */
219 54 : cluster_context = AllocSetContextCreate(PortalContext,
220 : "Cluster",
221 : ALLOCSET_DEFAULT_SIZES);
222 :
223 : /*
224 : * Either we're processing a partitioned table, or we were not given any
225 : * table name at all. In either case, obtain a list of relations to
226 : * process.
227 : *
228 : * In the former case, an index name must have been given, so we don't
229 : * need to recheck its "indisclustered" bit, but we have to check that it
230 : * is an index that we can cluster on. In the latter case, we set the
231 : * option bit to have indisclustered verified.
232 : *
233 : * Rechecking the relation itself is necessary here in all cases.
234 : */
235 54 : params.options |= CLUOPT_RECHECK;
236 54 : if (rel != NULL)
237 : {
238 : Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
239 26 : check_index_is_clusterable(rel, indexOid, AccessShareLock);
240 20 : rtcs = get_tables_to_cluster_partitioned(cluster_context, indexOid);
241 :
242 : /* close relation, releasing lock on parent table */
243 20 : table_close(rel, AccessExclusiveLock);
244 : }
245 : else
246 : {
247 28 : rtcs = get_tables_to_cluster(cluster_context);
248 28 : params.options |= CLUOPT_RECHECK_ISCLUSTERED;
249 : }
250 :
251 : /* Do the job. */
252 48 : cluster_multiple_rels(rtcs, ¶ms);
253 :
254 : /* Start a new transaction for the cleanup work. */
255 48 : StartTransactionCommand();
256 :
257 : /* Clean up working storage */
258 48 : MemoryContextDelete(cluster_context);
259 : }
260 :
261 : /*
262 : * Given a list of relations to cluster, process each of them in a separate
263 : * transaction.
264 : *
265 : * We expect to be in a transaction at start, but there isn't one when we
266 : * return.
267 : */
268 : static void
269 48 : cluster_multiple_rels(List *rtcs, ClusterParams *params)
270 : {
271 : ListCell *lc;
272 :
273 : /* Commit to get out of starting transaction */
274 48 : PopActiveSnapshot();
275 48 : CommitTransactionCommand();
276 :
277 : /* Cluster the tables, each in a separate transaction */
278 100 : foreach(lc, rtcs)
279 : {
280 52 : RelToCluster *rtc = (RelToCluster *) lfirst(lc);
281 :
282 : /* Start a new transaction for each relation. */
283 52 : StartTransactionCommand();
284 :
285 : /* functions in indexes may want a snapshot set */
286 52 : PushActiveSnapshot(GetTransactionSnapshot());
287 :
288 : /* Do the job. */
289 52 : cluster_rel(rtc->tableOid, rtc->indexOid, params);
290 :
291 52 : PopActiveSnapshot();
292 52 : CommitTransactionCommand();
293 : }
294 48 : }
295 :
296 : /*
297 : * cluster_rel
298 : *
299 : * This clusters the table by creating a new, clustered table and
300 : * swapping the relfilenumbers of the new table and the old table, so
301 : * the OID of the original table is preserved. Thus we do not lose
302 : * GRANT, inheritance nor references to this table (this was a bug
303 : * in releases through 7.3).
304 : *
305 : * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
306 : * the new table, it's better to create the indexes afterwards than to fill
307 : * them incrementally while we load the table.
308 : *
309 : * If indexOid is InvalidOid, the table will be rewritten in physical order
310 : * instead of index order. This is the new implementation of VACUUM FULL,
311 : * and error messages should refer to the operation as VACUUM not CLUSTER.
312 : */
313 : void
314 526 : cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params)
315 : {
316 : Relation OldHeap;
317 : Oid save_userid;
318 : int save_sec_context;
319 : int save_nestlevel;
320 526 : bool verbose = ((params->options & CLUOPT_VERBOSE) != 0);
321 526 : bool recheck = ((params->options & CLUOPT_RECHECK) != 0);
322 :
323 : /* Check for user-requested abort. */
324 526 : CHECK_FOR_INTERRUPTS();
325 :
326 526 : pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid);
327 526 : if (OidIsValid(indexOid))
328 198 : pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
329 : PROGRESS_CLUSTER_COMMAND_CLUSTER);
330 : else
331 328 : pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
332 : PROGRESS_CLUSTER_COMMAND_VACUUM_FULL);
333 :
334 : /*
335 : * We grab exclusive access to the target rel and index for the duration
336 : * of the transaction. (This is redundant for the single-transaction
337 : * case, since cluster() already did it.) The index lock is taken inside
338 : * check_index_is_clusterable.
339 : */
340 526 : OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
341 :
342 : /* If the table has gone away, we can skip processing it */
343 526 : if (!OldHeap)
344 : {
345 0 : pgstat_progress_end_command();
346 0 : return;
347 : }
348 :
349 : /*
350 : * Switch to the table owner's userid, so that any index functions are run
351 : * as that user. Also lock down security-restricted operations and
352 : * arrange to make GUC variable changes local to this command.
353 : */
354 526 : GetUserIdAndSecContext(&save_userid, &save_sec_context);
355 526 : SetUserIdAndSecContext(OldHeap->rd_rel->relowner,
356 : save_sec_context | SECURITY_RESTRICTED_OPERATION);
357 526 : save_nestlevel = NewGUCNestLevel();
358 :
359 : /*
360 : * Since we may open a new transaction for each relation, we have to check
361 : * that the relation still is what we think it is.
362 : *
363 : * If this is a single-transaction CLUSTER, we can skip these tests. We
364 : * *must* skip the one on indisclustered since it would reject an attempt
365 : * to cluster a not-previously-clustered index.
366 : */
367 526 : if (recheck)
368 : {
369 : /* Check that the user still has privileges for the relation */
370 52 : if (!cluster_is_permitted_for_relation(tableOid, save_userid))
371 : {
372 0 : relation_close(OldHeap, AccessExclusiveLock);
373 0 : goto out;
374 : }
375 :
376 : /*
377 : * Silently skip a temp table for a remote session. Only doing this
378 : * check in the "recheck" case is appropriate (which currently means
379 : * somebody is executing a database-wide CLUSTER or on a partitioned
380 : * table), because there is another check in cluster() which will stop
381 : * any attempt to cluster remote temp tables by name. There is
382 : * another check in cluster_rel which is redundant, but we leave it
383 : * for extra safety.
384 : */
385 52 : if (RELATION_IS_OTHER_TEMP(OldHeap))
386 : {
387 0 : relation_close(OldHeap, AccessExclusiveLock);
388 0 : goto out;
389 : }
390 :
391 52 : if (OidIsValid(indexOid))
392 : {
393 : /*
394 : * Check that the index still exists
395 : */
396 52 : if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
397 : {
398 0 : relation_close(OldHeap, AccessExclusiveLock);
399 0 : goto out;
400 : }
401 :
402 : /*
403 : * Check that the index is still the one with indisclustered set,
404 : * if needed.
405 : */
406 52 : if ((params->options & CLUOPT_RECHECK_ISCLUSTERED) != 0 &&
407 6 : !get_index_isclustered(indexOid))
408 : {
409 0 : relation_close(OldHeap, AccessExclusiveLock);
410 0 : goto out;
411 : }
412 : }
413 : }
414 :
415 : /*
416 : * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER
417 : * would work in most respects, but the index would only get marked as
418 : * indisclustered in the current database, leading to unexpected behavior
419 : * if CLUSTER were later invoked in another database.
420 : */
421 526 : if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
422 0 : ereport(ERROR,
423 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
424 : errmsg("cannot cluster a shared catalog")));
425 :
426 : /*
427 : * Don't process temp tables of other backends ... their local buffer
428 : * manager is not going to cope.
429 : */
430 526 : if (RELATION_IS_OTHER_TEMP(OldHeap))
431 : {
432 0 : if (OidIsValid(indexOid))
433 0 : ereport(ERROR,
434 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
435 : errmsg("cannot cluster temporary tables of other sessions")));
436 : else
437 0 : ereport(ERROR,
438 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
439 : errmsg("cannot vacuum temporary tables of other sessions")));
440 : }
441 :
442 : /*
443 : * Also check for active uses of the relation in the current transaction,
444 : * including open scans and pending AFTER trigger events.
445 : */
446 526 : CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
447 :
448 : /* Check heap and index are valid to cluster on */
449 526 : if (OidIsValid(indexOid))
450 198 : check_index_is_clusterable(OldHeap, indexOid, AccessExclusiveLock);
451 :
452 : /*
453 : * Quietly ignore the request if this is a materialized view which has not
454 : * been populated from its query. No harm is done because there is no data
455 : * to deal with, and we don't want to throw an error if this is part of a
456 : * multi-relation request -- for example, CLUSTER was run on the entire
457 : * database.
458 : */
459 526 : if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
460 0 : !RelationIsPopulated(OldHeap))
461 : {
462 0 : relation_close(OldHeap, AccessExclusiveLock);
463 0 : goto out;
464 : }
465 :
466 : Assert(OldHeap->rd_rel->relkind == RELKIND_RELATION ||
467 : OldHeap->rd_rel->relkind == RELKIND_MATVIEW ||
468 : OldHeap->rd_rel->relkind == RELKIND_TOASTVALUE);
469 :
470 : /*
471 : * All predicate locks on the tuples or pages are about to be made
472 : * invalid, because we move tuples around. Promote them to relation
473 : * locks. Predicate locks on indexes will be promoted when they are
474 : * reindexed.
475 : */
476 526 : TransferPredicateLocksToHeapRelation(OldHeap);
477 :
478 : /* rebuild_relation does all the dirty work */
479 526 : rebuild_relation(OldHeap, indexOid, verbose);
480 :
481 : /* NB: rebuild_relation does table_close() on OldHeap */
482 :
483 520 : out:
484 : /* Roll back any GUC changes executed by index functions */
485 520 : AtEOXact_GUC(false, save_nestlevel);
486 :
487 : /* Restore userid and security context */
488 520 : SetUserIdAndSecContext(save_userid, save_sec_context);
489 :
490 520 : pgstat_progress_end_command();
491 : }
492 :
493 : /*
494 : * Verify that the specified heap and index are valid to cluster on
495 : *
496 : * Side effect: obtains lock on the index. The caller may
497 : * in some cases already have AccessExclusiveLock on the table, but
498 : * not in all cases so we can't rely on the table-level lock for
499 : * protection here.
500 : */
501 : void
502 288 : check_index_is_clusterable(Relation OldHeap, Oid indexOid, LOCKMODE lockmode)
503 : {
504 : Relation OldIndex;
505 :
506 288 : OldIndex = index_open(indexOid, lockmode);
507 :
508 : /*
509 : * Check that index is in fact an index on the given relation
510 : */
511 288 : if (OldIndex->rd_index == NULL ||
512 288 : OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
513 0 : ereport(ERROR,
514 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
515 : errmsg("\"%s\" is not an index for table \"%s\"",
516 : RelationGetRelationName(OldIndex),
517 : RelationGetRelationName(OldHeap))));
518 :
519 : /* Index AM must allow clustering */
520 288 : if (!OldIndex->rd_indam->amclusterable)
521 0 : ereport(ERROR,
522 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
523 : errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
524 : RelationGetRelationName(OldIndex))));
525 :
526 : /*
527 : * Disallow clustering on incomplete indexes (those that might not index
528 : * every row of the relation). We could relax this by making a separate
529 : * seqscan pass over the table to copy the missing rows, but that seems
530 : * expensive and tedious.
531 : */
532 288 : if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
533 0 : ereport(ERROR,
534 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
535 : errmsg("cannot cluster on partial index \"%s\"",
536 : RelationGetRelationName(OldIndex))));
537 :
538 : /*
539 : * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
540 : * it might well not contain entries for every heap row, or might not even
541 : * be internally consistent. (But note that we don't check indcheckxmin;
542 : * the worst consequence of following broken HOT chains would be that we
543 : * might put recently-dead tuples out-of-order in the new table, and there
544 : * is little harm in that.)
545 : */
546 288 : if (!OldIndex->rd_index->indisvalid)
547 6 : ereport(ERROR,
548 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
549 : errmsg("cannot cluster on invalid index \"%s\"",
550 : RelationGetRelationName(OldIndex))));
551 :
552 : /* Drop relcache refcnt on OldIndex, but keep lock */
553 282 : index_close(OldIndex, NoLock);
554 282 : }
555 :
556 : /*
557 : * mark_index_clustered: mark the specified index as the one clustered on
558 : *
559 : * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
560 : */
561 : void
562 280 : mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
563 : {
564 : HeapTuple indexTuple;
565 : Form_pg_index indexForm;
566 : Relation pg_index;
567 : ListCell *index;
568 :
569 : /* Disallow applying to a partitioned table */
570 280 : if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
571 12 : ereport(ERROR,
572 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
573 : errmsg("cannot mark index clustered in partitioned table")));
574 :
575 : /*
576 : * If the index is already marked clustered, no need to do anything.
577 : */
578 268 : if (OidIsValid(indexOid))
579 : {
580 256 : if (get_index_isclustered(indexOid))
581 38 : return;
582 : }
583 :
584 : /*
585 : * Check each index of the relation and set/clear the bit as needed.
586 : */
587 230 : pg_index = table_open(IndexRelationId, RowExclusiveLock);
588 :
589 672 : foreach(index, RelationGetIndexList(rel))
590 : {
591 442 : Oid thisIndexOid = lfirst_oid(index);
592 :
593 442 : indexTuple = SearchSysCacheCopy1(INDEXRELID,
594 : ObjectIdGetDatum(thisIndexOid));
595 442 : if (!HeapTupleIsValid(indexTuple))
596 0 : elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
597 442 : indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
598 :
599 : /*
600 : * Unset the bit if set. We know it's wrong because we checked this
601 : * earlier.
602 : */
603 442 : if (indexForm->indisclustered)
604 : {
605 30 : indexForm->indisclustered = false;
606 30 : CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
607 : }
608 412 : else if (thisIndexOid == indexOid)
609 : {
610 : /* this was checked earlier, but let's be real sure */
611 218 : if (!indexForm->indisvalid)
612 0 : elog(ERROR, "cannot cluster on invalid index %u", indexOid);
613 218 : indexForm->indisclustered = true;
614 218 : CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
615 : }
616 :
617 442 : InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
618 : InvalidOid, is_internal);
619 :
620 442 : heap_freetuple(indexTuple);
621 : }
622 :
623 230 : table_close(pg_index, RowExclusiveLock);
624 : }
625 :
626 : /*
627 : * rebuild_relation: rebuild an existing relation in index or physical order
628 : *
629 : * OldHeap: table to rebuild --- must be opened and exclusive-locked!
630 : * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
631 : *
632 : * NB: this routine closes OldHeap at the right time; caller should not.
633 : */
634 : static void
635 526 : rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
636 : {
637 526 : Oid tableOid = RelationGetRelid(OldHeap);
638 526 : Oid accessMethod = OldHeap->rd_rel->relam;
639 526 : Oid tableSpace = OldHeap->rd_rel->reltablespace;
640 : Oid OIDNewHeap;
641 : char relpersistence;
642 : bool is_system_catalog;
643 : bool swap_toast_by_content;
644 : TransactionId frozenXid;
645 : MultiXactId cutoffMulti;
646 :
647 526 : if (OidIsValid(indexOid))
648 : /* Mark the correct index as clustered */
649 198 : mark_index_clustered(OldHeap, indexOid, true);
650 :
651 : /* Remember info about rel before closing OldHeap */
652 526 : relpersistence = OldHeap->rd_rel->relpersistence;
653 526 : is_system_catalog = IsSystemRelation(OldHeap);
654 :
655 : /* Close relcache entry, but keep lock until transaction commit */
656 526 : table_close(OldHeap, NoLock);
657 :
658 : /* Create the transient table that will receive the re-ordered data */
659 526 : OIDNewHeap = make_new_heap(tableOid, tableSpace,
660 : accessMethod,
661 : relpersistence,
662 : AccessExclusiveLock);
663 :
664 : /* Copy the heap data into the new table in the desired order */
665 526 : copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
666 : &swap_toast_by_content, &frozenXid, &cutoffMulti);
667 :
668 : /*
669 : * Swap the physical files of the target and transient tables, then
670 : * rebuild the target's indexes and throw away the transient table.
671 : */
672 526 : finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
673 : swap_toast_by_content, false, true,
674 : frozenXid, cutoffMulti,
675 : relpersistence);
676 520 : }
677 :
678 :
679 : /*
680 : * Create the transient table that will be filled with new data during
681 : * CLUSTER, ALTER TABLE, and similar operations. The transient table
682 : * duplicates the logical structure of the OldHeap; but will have the
683 : * specified physical storage properties NewTableSpace, NewAccessMethod, and
684 : * relpersistence.
685 : *
686 : * After this, the caller should load the new heap with transferred/modified
687 : * data, then call finish_heap_swap to complete the operation.
688 : */
689 : Oid
690 1494 : make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, Oid NewAccessMethod,
691 : char relpersistence, LOCKMODE lockmode)
692 : {
693 : TupleDesc OldHeapDesc;
694 : char NewHeapName[NAMEDATALEN];
695 : Oid OIDNewHeap;
696 : Oid toastid;
697 : Relation OldHeap;
698 : HeapTuple tuple;
699 : Datum reloptions;
700 : bool isNull;
701 : Oid namespaceid;
702 :
703 1494 : OldHeap = table_open(OIDOldHeap, lockmode);
704 1494 : OldHeapDesc = RelationGetDescr(OldHeap);
705 :
706 : /*
707 : * Note that the NewHeap will not receive any of the defaults or
708 : * constraints associated with the OldHeap; we don't need 'em, and there's
709 : * no reason to spend cycles inserting them into the catalogs only to
710 : * delete them.
711 : */
712 :
713 : /*
714 : * But we do want to use reloptions of the old heap for new heap.
715 : */
716 1494 : tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
717 1494 : if (!HeapTupleIsValid(tuple))
718 0 : elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
719 1494 : reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
720 : &isNull);
721 1494 : if (isNull)
722 1456 : reloptions = (Datum) 0;
723 :
724 1494 : if (relpersistence == RELPERSISTENCE_TEMP)
725 140 : namespaceid = LookupCreationNamespace("pg_temp");
726 : else
727 1354 : namespaceid = RelationGetNamespace(OldHeap);
728 :
729 : /*
730 : * Create the new heap, using a temporary name in the same namespace as
731 : * the existing table. NOTE: there is some risk of collision with user
732 : * relnames. Working around this seems more trouble than it's worth; in
733 : * particular, we can't create the new heap in a different namespace from
734 : * the old, or we will have problems with the TEMP status of temp tables.
735 : *
736 : * Note: the new heap is not a shared relation, even if we are rebuilding
737 : * a shared rel. However, we do make the new heap mapped if the source is
738 : * mapped. This simplifies swap_relation_files, and is absolutely
739 : * necessary for rebuilding pg_class, for reasons explained there.
740 : */
741 1494 : snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
742 :
743 1494 : OIDNewHeap = heap_create_with_catalog(NewHeapName,
744 : namespaceid,
745 : NewTableSpace,
746 : InvalidOid,
747 : InvalidOid,
748 : InvalidOid,
749 1494 : OldHeap->rd_rel->relowner,
750 : NewAccessMethod,
751 : OldHeapDesc,
752 : NIL,
753 : RELKIND_RELATION,
754 : relpersistence,
755 : false,
756 1494 : RelationIsMapped(OldHeap),
757 : ONCOMMIT_NOOP,
758 : reloptions,
759 : false,
760 : true,
761 : true,
762 : OIDOldHeap,
763 : NULL);
764 : Assert(OIDNewHeap != InvalidOid);
765 :
766 1494 : ReleaseSysCache(tuple);
767 :
768 : /*
769 : * Advance command counter so that the newly-created relation's catalog
770 : * tuples will be visible to table_open.
771 : */
772 1494 : CommandCounterIncrement();
773 :
774 : /*
775 : * If necessary, create a TOAST table for the new relation.
776 : *
777 : * If the relation doesn't have a TOAST table already, we can't need one
778 : * for the new relation. The other way around is possible though: if some
779 : * wide columns have been dropped, NewHeapCreateToastTable can decide that
780 : * no TOAST table is needed for the new table.
781 : *
782 : * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
783 : * that the TOAST table will be visible for insertion.
784 : */
785 1494 : toastid = OldHeap->rd_rel->reltoastrelid;
786 1494 : if (OidIsValid(toastid))
787 : {
788 : /* keep the existing toast table's reloptions, if any */
789 588 : tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
790 588 : if (!HeapTupleIsValid(tuple))
791 0 : elog(ERROR, "cache lookup failed for relation %u", toastid);
792 588 : reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
793 : &isNull);
794 588 : if (isNull)
795 588 : reloptions = (Datum) 0;
796 :
797 588 : NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode, toastid);
798 :
799 588 : ReleaseSysCache(tuple);
800 : }
801 :
802 1494 : table_close(OldHeap, NoLock);
803 :
804 1494 : return OIDNewHeap;
805 : }
806 :
807 : /*
808 : * Do the physical copying of table data.
809 : *
810 : * There are three output parameters:
811 : * *pSwapToastByContent is set true if toast tables must be swapped by content.
812 : * *pFreezeXid receives the TransactionId used as freeze cutoff point.
813 : * *pCutoffMulti receives the MultiXactId used as a cutoff point.
814 : */
815 : static void
816 526 : copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
817 : bool *pSwapToastByContent, TransactionId *pFreezeXid,
818 : MultiXactId *pCutoffMulti)
819 : {
820 : Relation NewHeap,
821 : OldHeap,
822 : OldIndex;
823 : Relation relRelation;
824 : HeapTuple reltup;
825 : Form_pg_class relform;
826 : TupleDesc oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
827 : TupleDesc newTupDesc PG_USED_FOR_ASSERTS_ONLY;
828 : VacuumParams params;
829 : struct VacuumCutoffs cutoffs;
830 : bool use_sort;
831 526 : double num_tuples = 0,
832 526 : tups_vacuumed = 0,
833 526 : tups_recently_dead = 0;
834 : BlockNumber num_pages;
835 526 : int elevel = verbose ? INFO : DEBUG2;
836 : PGRUsage ru0;
837 : char *nspname;
838 :
839 526 : pg_rusage_init(&ru0);
840 :
841 : /*
842 : * Open the relations we need.
843 : */
844 526 : NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
845 526 : OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
846 526 : if (OidIsValid(OIDOldIndex))
847 198 : OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
848 : else
849 328 : OldIndex = NULL;
850 :
851 : /* Store a copy of the namespace name for logging purposes */
852 526 : nspname = get_namespace_name(RelationGetNamespace(OldHeap));
853 :
854 : /*
855 : * Their tuple descriptors should be exactly alike, but here we only need
856 : * assume that they have the same number of columns.
857 : */
858 526 : oldTupDesc = RelationGetDescr(OldHeap);
859 526 : newTupDesc = RelationGetDescr(NewHeap);
860 : Assert(newTupDesc->natts == oldTupDesc->natts);
861 :
862 : /*
863 : * If the OldHeap has a toast table, get lock on the toast table to keep
864 : * it from being vacuumed. This is needed because autovacuum processes
865 : * toast tables independently of their main tables, with no lock on the
866 : * latter. If an autovacuum were to start on the toast table after we
867 : * compute our OldestXmin below, it would use a later OldestXmin, and then
868 : * possibly remove as DEAD toast tuples belonging to main tuples we think
869 : * are only RECENTLY_DEAD. Then we'd fail while trying to copy those
870 : * tuples.
871 : *
872 : * We don't need to open the toast relation here, just lock it. The lock
873 : * will be held till end of transaction.
874 : */
875 526 : if (OldHeap->rd_rel->reltoastrelid)
876 174 : LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
877 :
878 : /*
879 : * If both tables have TOAST tables, perform toast swap by content. It is
880 : * possible that the old table has a toast table but the new one doesn't,
881 : * if toastable columns have been dropped. In that case we have to do
882 : * swap by links. This is okay because swap by content is only essential
883 : * for system catalogs, and we don't support schema changes for them.
884 : */
885 526 : if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
886 : {
887 174 : *pSwapToastByContent = true;
888 :
889 : /*
890 : * When doing swap by content, any toast pointers written into NewHeap
891 : * must use the old toast table's OID, because that's where the toast
892 : * data will eventually be found. Set this up by setting rd_toastoid.
893 : * This also tells toast_save_datum() to preserve the toast value
894 : * OIDs, which we want so as not to invalidate toast pointers in
895 : * system catalog caches, and to avoid making multiple copies of a
896 : * single toast value.
897 : *
898 : * Note that we must hold NewHeap open until we are done writing data,
899 : * since the relcache will not guarantee to remember this setting once
900 : * the relation is closed. Also, this technique depends on the fact
901 : * that no one will try to read from the NewHeap until after we've
902 : * finished writing it and swapping the rels --- otherwise they could
903 : * follow the toast pointers to the wrong place. (It would actually
904 : * work for values copied over from the old toast table, but not for
905 : * any values that we toast which were previously not toasted.)
906 : */
907 174 : NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
908 : }
909 : else
910 352 : *pSwapToastByContent = false;
911 :
912 : /*
913 : * Compute xids used to freeze and weed out dead tuples and multixacts.
914 : * Since we're going to rewrite the whole table anyway, there's no reason
915 : * not to be aggressive about this.
916 : */
917 526 : memset(¶ms, 0, sizeof(VacuumParams));
918 526 : vacuum_get_cutoffs(OldHeap, ¶ms, &cutoffs);
919 :
920 : /*
921 : * FreezeXid will become the table's new relfrozenxid, and that mustn't go
922 : * backwards, so take the max.
923 : */
924 1052 : if (TransactionIdIsValid(OldHeap->rd_rel->relfrozenxid) &&
925 526 : TransactionIdPrecedes(cutoffs.FreezeLimit,
926 526 : OldHeap->rd_rel->relfrozenxid))
927 106 : cutoffs.FreezeLimit = OldHeap->rd_rel->relfrozenxid;
928 :
929 : /*
930 : * MultiXactCutoff, similarly, shouldn't go backwards either.
931 : */
932 1052 : if (MultiXactIdIsValid(OldHeap->rd_rel->relminmxid) &&
933 526 : MultiXactIdPrecedes(cutoffs.MultiXactCutoff,
934 526 : OldHeap->rd_rel->relminmxid))
935 0 : cutoffs.MultiXactCutoff = OldHeap->rd_rel->relminmxid;
936 :
937 : /*
938 : * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
939 : * the OldHeap. We know how to use a sort to duplicate the ordering of a
940 : * btree index, and will use seqscan-and-sort for that case if the planner
941 : * tells us it's cheaper. Otherwise, always indexscan if an index is
942 : * provided, else plain seqscan.
943 : */
944 526 : if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
945 198 : use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
946 : else
947 328 : use_sort = false;
948 :
949 : /* Log what we're doing */
950 526 : if (OldIndex != NULL && !use_sort)
951 96 : ereport(elevel,
952 : (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
953 : nspname,
954 : RelationGetRelationName(OldHeap),
955 : RelationGetRelationName(OldIndex))));
956 430 : else if (use_sort)
957 102 : ereport(elevel,
958 : (errmsg("clustering \"%s.%s\" using sequential scan and sort",
959 : nspname,
960 : RelationGetRelationName(OldHeap))));
961 : else
962 328 : ereport(elevel,
963 : (errmsg("vacuuming \"%s.%s\"",
964 : nspname,
965 : RelationGetRelationName(OldHeap))));
966 :
967 : /*
968 : * Hand off the actual copying to AM specific function, the generic code
969 : * cannot know how to deal with visibility across AMs. Note that this
970 : * routine is allowed to set FreezeXid / MultiXactCutoff to different
971 : * values (e.g. because the AM doesn't use freezing).
972 : */
973 526 : table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
974 : cutoffs.OldestXmin, &cutoffs.FreezeLimit,
975 : &cutoffs.MultiXactCutoff,
976 : &num_tuples, &tups_vacuumed,
977 : &tups_recently_dead);
978 :
979 : /* return selected values to caller, get set as relfrozenxid/minmxid */
980 526 : *pFreezeXid = cutoffs.FreezeLimit;
981 526 : *pCutoffMulti = cutoffs.MultiXactCutoff;
982 :
983 : /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
984 526 : NewHeap->rd_toastoid = InvalidOid;
985 :
986 526 : num_pages = RelationGetNumberOfBlocks(NewHeap);
987 :
988 : /* Log what we did */
989 526 : ereport(elevel,
990 : (errmsg("\"%s.%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
991 : nspname,
992 : RelationGetRelationName(OldHeap),
993 : tups_vacuumed, num_tuples,
994 : RelationGetNumberOfBlocks(OldHeap)),
995 : errdetail("%.0f dead row versions cannot be removed yet.\n"
996 : "%s.",
997 : tups_recently_dead,
998 : pg_rusage_show(&ru0))));
999 :
1000 526 : if (OldIndex != NULL)
1001 198 : index_close(OldIndex, NoLock);
1002 526 : table_close(OldHeap, NoLock);
1003 526 : table_close(NewHeap, NoLock);
1004 :
1005 : /* Update pg_class to reflect the correct values of pages and tuples. */
1006 526 : relRelation = table_open(RelationRelationId, RowExclusiveLock);
1007 :
1008 526 : reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
1009 526 : if (!HeapTupleIsValid(reltup))
1010 0 : elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
1011 526 : relform = (Form_pg_class) GETSTRUCT(reltup);
1012 :
1013 526 : relform->relpages = num_pages;
1014 526 : relform->reltuples = num_tuples;
1015 :
1016 : /* Don't update the stats for pg_class. See swap_relation_files. */
1017 526 : if (OIDOldHeap != RelationRelationId)
1018 502 : CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1019 : else
1020 24 : CacheInvalidateRelcacheByTuple(reltup);
1021 :
1022 : /* Clean up. */
1023 526 : heap_freetuple(reltup);
1024 526 : table_close(relRelation, RowExclusiveLock);
1025 :
1026 : /* Make the update visible */
1027 526 : CommandCounterIncrement();
1028 526 : }
1029 :
1030 : /*
1031 : * Swap the physical files of two given relations.
1032 : *
1033 : * We swap the physical identity (reltablespace, relfilenumber) while keeping
1034 : * the same logical identities of the two relations. relpersistence is also
1035 : * swapped, which is critical since it determines where buffers live for each
1036 : * relation.
1037 : *
1038 : * We can swap associated TOAST data in either of two ways: recursively swap
1039 : * the physical content of the toast tables (and their indexes), or swap the
1040 : * TOAST links in the given relations' pg_class entries. The former is needed
1041 : * to manage rewrites of shared catalogs (where we cannot change the pg_class
1042 : * links) while the latter is the only way to handle cases in which a toast
1043 : * table is added or removed altogether.
1044 : *
1045 : * Additionally, the first relation is marked with relfrozenxid set to
1046 : * frozenXid. It seems a bit ugly to have this here, but the caller would
1047 : * have to do it anyway, so having it here saves a heap_update. Note: in
1048 : * the swap-toast-links case, we assume we don't need to change the toast
1049 : * table's relfrozenxid: the new version of the toast table should already
1050 : * have relfrozenxid set to RecentXmin, which is good enough.
1051 : *
1052 : * Lastly, if r2 and its toast table and toast index (if any) are mapped,
1053 : * their OIDs are emitted into mapped_tables[]. This is hacky but beats
1054 : * having to look the information up again later in finish_heap_swap.
1055 : */
1056 : static void
1057 1722 : swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
1058 : bool swap_toast_by_content,
1059 : bool is_internal,
1060 : TransactionId frozenXid,
1061 : MultiXactId cutoffMulti,
1062 : Oid *mapped_tables)
1063 : {
1064 : Relation relRelation;
1065 : HeapTuple reltup1,
1066 : reltup2;
1067 : Form_pg_class relform1,
1068 : relform2;
1069 : RelFileNumber relfilenumber1,
1070 : relfilenumber2;
1071 : RelFileNumber swaptemp;
1072 : char swptmpchr;
1073 :
1074 : /* We need writable copies of both pg_class tuples. */
1075 1722 : relRelation = table_open(RelationRelationId, RowExclusiveLock);
1076 :
1077 1722 : reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
1078 1722 : if (!HeapTupleIsValid(reltup1))
1079 0 : elog(ERROR, "cache lookup failed for relation %u", r1);
1080 1722 : relform1 = (Form_pg_class) GETSTRUCT(reltup1);
1081 :
1082 1722 : reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
1083 1722 : if (!HeapTupleIsValid(reltup2))
1084 0 : elog(ERROR, "cache lookup failed for relation %u", r2);
1085 1722 : relform2 = (Form_pg_class) GETSTRUCT(reltup2);
1086 :
1087 1722 : relfilenumber1 = relform1->relfilenode;
1088 1722 : relfilenumber2 = relform2->relfilenode;
1089 :
1090 1722 : if (RelFileNumberIsValid(relfilenumber1) &&
1091 : RelFileNumberIsValid(relfilenumber2))
1092 : {
1093 : /*
1094 : * Normal non-mapped relations: swap relfilenumbers, reltablespaces,
1095 : * relpersistence
1096 : */
1097 : Assert(!target_is_pg_class);
1098 :
1099 1566 : swaptemp = relform1->relfilenode;
1100 1566 : relform1->relfilenode = relform2->relfilenode;
1101 1566 : relform2->relfilenode = swaptemp;
1102 :
1103 1566 : swaptemp = relform1->reltablespace;
1104 1566 : relform1->reltablespace = relform2->reltablespace;
1105 1566 : relform2->reltablespace = swaptemp;
1106 :
1107 1566 : swaptemp = relform1->relam;
1108 1566 : relform1->relam = relform2->relam;
1109 1566 : relform2->relam = swaptemp;
1110 :
1111 1566 : swptmpchr = relform1->relpersistence;
1112 1566 : relform1->relpersistence = relform2->relpersistence;
1113 1566 : relform2->relpersistence = swptmpchr;
1114 :
1115 : /* Also swap toast links, if we're swapping by links */
1116 1566 : if (!swap_toast_by_content)
1117 : {
1118 1170 : swaptemp = relform1->reltoastrelid;
1119 1170 : relform1->reltoastrelid = relform2->reltoastrelid;
1120 1170 : relform2->reltoastrelid = swaptemp;
1121 : }
1122 : }
1123 : else
1124 : {
1125 : /*
1126 : * Mapped-relation case. Here we have to swap the relation mappings
1127 : * instead of modifying the pg_class columns. Both must be mapped.
1128 : */
1129 156 : if (RelFileNumberIsValid(relfilenumber1) ||
1130 : RelFileNumberIsValid(relfilenumber2))
1131 0 : elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
1132 : NameStr(relform1->relname));
1133 :
1134 : /*
1135 : * We can't change the tablespace nor persistence of a mapped rel, and
1136 : * we can't handle toast link swapping for one either, because we must
1137 : * not apply any critical changes to its pg_class row. These cases
1138 : * should be prevented by upstream permissions tests, so these checks
1139 : * are non-user-facing emergency backstop.
1140 : */
1141 156 : if (relform1->reltablespace != relform2->reltablespace)
1142 0 : elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
1143 : NameStr(relform1->relname));
1144 156 : if (relform1->relpersistence != relform2->relpersistence)
1145 0 : elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
1146 : NameStr(relform1->relname));
1147 156 : if (relform1->relam != relform2->relam)
1148 0 : elog(ERROR, "cannot change access method of mapped relation \"%s\"",
1149 : NameStr(relform1->relname));
1150 156 : if (!swap_toast_by_content &&
1151 30 : (relform1->reltoastrelid || relform2->reltoastrelid))
1152 0 : elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
1153 : NameStr(relform1->relname));
1154 :
1155 : /*
1156 : * Fetch the mappings --- shouldn't fail, but be paranoid
1157 : */
1158 156 : relfilenumber1 = RelationMapOidToFilenumber(r1, relform1->relisshared);
1159 156 : if (!RelFileNumberIsValid(relfilenumber1))
1160 0 : elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1161 : NameStr(relform1->relname), r1);
1162 156 : relfilenumber2 = RelationMapOidToFilenumber(r2, relform2->relisshared);
1163 156 : if (!RelFileNumberIsValid(relfilenumber2))
1164 0 : elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1165 : NameStr(relform2->relname), r2);
1166 :
1167 : /*
1168 : * Send replacement mappings to relmapper. Note these won't actually
1169 : * take effect until CommandCounterIncrement.
1170 : */
1171 156 : RelationMapUpdateMap(r1, relfilenumber2, relform1->relisshared, false);
1172 156 : RelationMapUpdateMap(r2, relfilenumber1, relform2->relisshared, false);
1173 :
1174 : /* Pass OIDs of mapped r2 tables back to caller */
1175 156 : *mapped_tables++ = r2;
1176 : }
1177 :
1178 : /*
1179 : * Recognize that rel1's relfilenumber (swapped from rel2) is new in this
1180 : * subtransaction. The rel2 storage (swapped from rel1) may or may not be
1181 : * new.
1182 : */
1183 : {
1184 : Relation rel1,
1185 : rel2;
1186 :
1187 1722 : rel1 = relation_open(r1, NoLock);
1188 1722 : rel2 = relation_open(r2, NoLock);
1189 1722 : rel2->rd_createSubid = rel1->rd_createSubid;
1190 1722 : rel2->rd_newRelfilelocatorSubid = rel1->rd_newRelfilelocatorSubid;
1191 1722 : rel2->rd_firstRelfilelocatorSubid = rel1->rd_firstRelfilelocatorSubid;
1192 1722 : RelationAssumeNewRelfilelocator(rel1);
1193 1722 : relation_close(rel1, NoLock);
1194 1722 : relation_close(rel2, NoLock);
1195 : }
1196 :
1197 : /*
1198 : * In the case of a shared catalog, these next few steps will only affect
1199 : * our own database's pg_class row; but that's okay, because they are all
1200 : * noncritical updates. That's also an important fact for the case of a
1201 : * mapped catalog, because it's possible that we'll commit the map change
1202 : * and then fail to commit the pg_class update.
1203 : */
1204 :
1205 : /* set rel1's frozen Xid and minimum MultiXid */
1206 1722 : if (relform1->relkind != RELKIND_INDEX)
1207 : {
1208 : Assert(!TransactionIdIsValid(frozenXid) ||
1209 : TransactionIdIsNormal(frozenXid));
1210 1548 : relform1->relfrozenxid = frozenXid;
1211 1548 : relform1->relminmxid = cutoffMulti;
1212 : }
1213 :
1214 : /* swap size statistics too, since new rel has freshly-updated stats */
1215 : {
1216 : int32 swap_pages;
1217 : float4 swap_tuples;
1218 : int32 swap_allvisible;
1219 :
1220 1722 : swap_pages = relform1->relpages;
1221 1722 : relform1->relpages = relform2->relpages;
1222 1722 : relform2->relpages = swap_pages;
1223 :
1224 1722 : swap_tuples = relform1->reltuples;
1225 1722 : relform1->reltuples = relform2->reltuples;
1226 1722 : relform2->reltuples = swap_tuples;
1227 :
1228 1722 : swap_allvisible = relform1->relallvisible;
1229 1722 : relform1->relallvisible = relform2->relallvisible;
1230 1722 : relform2->relallvisible = swap_allvisible;
1231 : }
1232 :
1233 : /*
1234 : * Update the tuples in pg_class --- unless the target relation of the
1235 : * swap is pg_class itself. In that case, there is zero point in making
1236 : * changes because we'd be updating the old data that we're about to throw
1237 : * away. Because the real work being done here for a mapped relation is
1238 : * just to change the relation map settings, it's all right to not update
1239 : * the pg_class rows in this case. The most important changes will instead
1240 : * performed later, in finish_heap_swap() itself.
1241 : */
1242 1722 : if (!target_is_pg_class)
1243 : {
1244 : CatalogIndexState indstate;
1245 :
1246 1698 : indstate = CatalogOpenIndexes(relRelation);
1247 1698 : CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
1248 : indstate);
1249 1698 : CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
1250 : indstate);
1251 1698 : CatalogCloseIndexes(indstate);
1252 : }
1253 : else
1254 : {
1255 : /* no update ... but we do still need relcache inval */
1256 24 : CacheInvalidateRelcacheByTuple(reltup1);
1257 24 : CacheInvalidateRelcacheByTuple(reltup2);
1258 : }
1259 :
1260 : /*
1261 : * Post alter hook for modified relations. The change to r2 is always
1262 : * internal, but r1 depends on the invocation context.
1263 : */
1264 1722 : InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
1265 : InvalidOid, is_internal);
1266 1722 : InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
1267 : InvalidOid, true);
1268 :
1269 : /*
1270 : * If we have toast tables associated with the relations being swapped,
1271 : * deal with them too.
1272 : */
1273 1722 : if (relform1->reltoastrelid || relform2->reltoastrelid)
1274 : {
1275 546 : if (swap_toast_by_content)
1276 : {
1277 174 : if (relform1->reltoastrelid && relform2->reltoastrelid)
1278 : {
1279 : /* Recursively swap the contents of the toast tables */
1280 174 : swap_relation_files(relform1->reltoastrelid,
1281 : relform2->reltoastrelid,
1282 : target_is_pg_class,
1283 : swap_toast_by_content,
1284 : is_internal,
1285 : frozenXid,
1286 : cutoffMulti,
1287 : mapped_tables);
1288 : }
1289 : else
1290 : {
1291 : /* caller messed up */
1292 0 : elog(ERROR, "cannot swap toast files by content when there's only one");
1293 : }
1294 : }
1295 : else
1296 : {
1297 : /*
1298 : * We swapped the ownership links, so we need to change dependency
1299 : * data to match.
1300 : *
1301 : * NOTE: it is possible that only one table has a toast table.
1302 : *
1303 : * NOTE: at present, a TOAST table's only dependency is the one on
1304 : * its owning table. If more are ever created, we'd need to use
1305 : * something more selective than deleteDependencyRecordsFor() to
1306 : * get rid of just the link we want.
1307 : */
1308 : ObjectAddress baseobject,
1309 : toastobject;
1310 : long count;
1311 :
1312 : /*
1313 : * We disallow this case for system catalogs, to avoid the
1314 : * possibility that the catalog we're rebuilding is one of the
1315 : * ones the dependency changes would change. It's too late to be
1316 : * making any data changes to the target catalog.
1317 : */
1318 372 : if (IsSystemClass(r1, relform1))
1319 0 : elog(ERROR, "cannot swap toast files by links for system catalogs");
1320 :
1321 : /* Delete old dependencies */
1322 372 : if (relform1->reltoastrelid)
1323 : {
1324 340 : count = deleteDependencyRecordsFor(RelationRelationId,
1325 : relform1->reltoastrelid,
1326 : false);
1327 340 : if (count != 1)
1328 0 : elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1329 : count);
1330 : }
1331 372 : if (relform2->reltoastrelid)
1332 : {
1333 372 : count = deleteDependencyRecordsFor(RelationRelationId,
1334 : relform2->reltoastrelid,
1335 : false);
1336 372 : if (count != 1)
1337 0 : elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1338 : count);
1339 : }
1340 :
1341 : /* Register new dependencies */
1342 372 : baseobject.classId = RelationRelationId;
1343 372 : baseobject.objectSubId = 0;
1344 372 : toastobject.classId = RelationRelationId;
1345 372 : toastobject.objectSubId = 0;
1346 :
1347 372 : if (relform1->reltoastrelid)
1348 : {
1349 340 : baseobject.objectId = r1;
1350 340 : toastobject.objectId = relform1->reltoastrelid;
1351 340 : recordDependencyOn(&toastobject, &baseobject,
1352 : DEPENDENCY_INTERNAL);
1353 : }
1354 :
1355 372 : if (relform2->reltoastrelid)
1356 : {
1357 372 : baseobject.objectId = r2;
1358 372 : toastobject.objectId = relform2->reltoastrelid;
1359 372 : recordDependencyOn(&toastobject, &baseobject,
1360 : DEPENDENCY_INTERNAL);
1361 : }
1362 : }
1363 : }
1364 :
1365 : /*
1366 : * If we're swapping two toast tables by content, do the same for their
1367 : * valid index. The swap can actually be safely done only if the relations
1368 : * have indexes.
1369 : */
1370 1722 : if (swap_toast_by_content &&
1371 522 : relform1->relkind == RELKIND_TOASTVALUE &&
1372 174 : relform2->relkind == RELKIND_TOASTVALUE)
1373 : {
1374 : Oid toastIndex1,
1375 : toastIndex2;
1376 :
1377 : /* Get valid index for each relation */
1378 174 : toastIndex1 = toast_get_valid_index(r1,
1379 : AccessExclusiveLock);
1380 174 : toastIndex2 = toast_get_valid_index(r2,
1381 : AccessExclusiveLock);
1382 :
1383 174 : swap_relation_files(toastIndex1,
1384 : toastIndex2,
1385 : target_is_pg_class,
1386 : swap_toast_by_content,
1387 : is_internal,
1388 : InvalidTransactionId,
1389 : InvalidMultiXactId,
1390 : mapped_tables);
1391 : }
1392 :
1393 : /* Clean up. */
1394 1722 : heap_freetuple(reltup1);
1395 1722 : heap_freetuple(reltup2);
1396 :
1397 1722 : table_close(relRelation, RowExclusiveLock);
1398 :
1399 : /*
1400 : * Close both relcache entries' smgr links. We need this kluge because
1401 : * both links will be invalidated during upcoming CommandCounterIncrement.
1402 : * Whichever of the rels is the second to be cleared will have a dangling
1403 : * reference to the other's smgr entry. Rather than trying to avoid this
1404 : * by ordering operations just so, it's easiest to close the links first.
1405 : * (Fortunately, since one of the entries is local in our transaction,
1406 : * it's sufficient to clear out our own relcache this way; the problem
1407 : * cannot arise for other backends when they see our update on the
1408 : * non-transient relation.)
1409 : *
1410 : * Caution: the placement of this step interacts with the decision to
1411 : * handle toast rels by recursion. When we are trying to rebuild pg_class
1412 : * itself, the smgr close on pg_class must happen after all accesses in
1413 : * this function.
1414 : */
1415 1722 : RelationCloseSmgrByOid(r1);
1416 1722 : RelationCloseSmgrByOid(r2);
1417 1722 : }
1418 :
1419 : /*
1420 : * Remove the transient table that was built by make_new_heap, and finish
1421 : * cleaning up (including rebuilding all indexes on the old heap).
1422 : */
1423 : void
1424 1374 : finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1425 : bool is_system_catalog,
1426 : bool swap_toast_by_content,
1427 : bool check_constraints,
1428 : bool is_internal,
1429 : TransactionId frozenXid,
1430 : MultiXactId cutoffMulti,
1431 : char newrelpersistence)
1432 : {
1433 : ObjectAddress object;
1434 : Oid mapped_tables[4];
1435 : int reindex_flags;
1436 1374 : ReindexParams reindex_params = {0};
1437 : int i;
1438 :
1439 : /* Report that we are now swapping relation files */
1440 1374 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1441 : PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);
1442 :
1443 : /* Zero out possible results from swapped_relation_files */
1444 1374 : memset(mapped_tables, 0, sizeof(mapped_tables));
1445 :
1446 : /*
1447 : * Swap the contents of the heap relations (including any toast tables).
1448 : * Also set old heap's relfrozenxid to frozenXid.
1449 : */
1450 1374 : swap_relation_files(OIDOldHeap, OIDNewHeap,
1451 : (OIDOldHeap == RelationRelationId),
1452 : swap_toast_by_content, is_internal,
1453 : frozenXid, cutoffMulti, mapped_tables);
1454 :
1455 : /*
1456 : * If it's a system catalog, queue a sinval message to flush all catcaches
1457 : * on the catalog when we reach CommandCounterIncrement.
1458 : */
1459 1374 : if (is_system_catalog)
1460 200 : CacheInvalidateCatalog(OIDOldHeap);
1461 :
1462 : /*
1463 : * Rebuild each index on the relation (but not the toast table, which is
1464 : * all-new at this point). It is important to do this before the DROP
1465 : * step because if we are processing a system catalog that will be used
1466 : * during DROP, we want to have its indexes available. There is no
1467 : * advantage to the other order anyway because this is all transactional,
1468 : * so no chance to reclaim disk space before commit. We do not need a
1469 : * final CommandCounterIncrement() because reindex_relation does it.
1470 : *
1471 : * Note: because index_build is called via reindex_relation, it will never
1472 : * set indcheckxmin true for the indexes. This is OK even though in some
1473 : * sense we are building new indexes rather than rebuilding existing ones,
1474 : * because the new heap won't contain any HOT chains at all, let alone
1475 : * broken ones, so it can't be necessary to set indcheckxmin.
1476 : */
1477 1374 : reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1478 1374 : if (check_constraints)
1479 848 : reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
1480 :
1481 : /*
1482 : * Ensure that the indexes have the same persistence as the parent
1483 : * relation.
1484 : */
1485 1374 : if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1486 20 : reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1487 1354 : else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1488 1280 : reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1489 :
1490 : /* Report that we are now reindexing relations */
1491 1374 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1492 : PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);
1493 :
1494 1374 : reindex_relation(OIDOldHeap, reindex_flags, &reindex_params);
1495 :
1496 : /* Report that we are now doing clean up */
1497 1356 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1498 : PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);
1499 :
1500 : /*
1501 : * If the relation being rebuilt is pg_class, swap_relation_files()
1502 : * couldn't update pg_class's own pg_class entry (check comments in
1503 : * swap_relation_files()), thus relfrozenxid was not updated. That's
1504 : * annoying because a potential reason for doing a VACUUM FULL is a
1505 : * imminent or actual anti-wraparound shutdown. So, now that we can
1506 : * access the new relation using its indices, update relfrozenxid.
1507 : * pg_class doesn't have a toast relation, so we don't need to update the
1508 : * corresponding toast relation. Not that there's little point moving all
1509 : * relfrozenxid updates here since swap_relation_files() needs to write to
1510 : * pg_class for non-mapped relations anyway.
1511 : */
1512 1356 : if (OIDOldHeap == RelationRelationId)
1513 : {
1514 : Relation relRelation;
1515 : HeapTuple reltup;
1516 : Form_pg_class relform;
1517 :
1518 24 : relRelation = table_open(RelationRelationId, RowExclusiveLock);
1519 :
1520 24 : reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1521 24 : if (!HeapTupleIsValid(reltup))
1522 0 : elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1523 24 : relform = (Form_pg_class) GETSTRUCT(reltup);
1524 :
1525 24 : relform->relfrozenxid = frozenXid;
1526 24 : relform->relminmxid = cutoffMulti;
1527 :
1528 24 : CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1529 :
1530 24 : table_close(relRelation, RowExclusiveLock);
1531 : }
1532 :
1533 : /* Destroy new heap with old filenumber */
1534 1356 : object.classId = RelationRelationId;
1535 1356 : object.objectId = OIDNewHeap;
1536 1356 : object.objectSubId = 0;
1537 :
1538 : /*
1539 : * The new relation is local to our transaction and we know nothing
1540 : * depends on it, so DROP_RESTRICT should be OK.
1541 : */
1542 1356 : performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1543 :
1544 : /* performDeletion does CommandCounterIncrement at end */
1545 :
1546 : /*
1547 : * Now we must remove any relation mapping entries that we set up for the
1548 : * transient table, as well as its toast table and toast index if any. If
1549 : * we fail to do this before commit, the relmapper will complain about new
1550 : * permanent map entries being added post-bootstrap.
1551 : */
1552 1512 : for (i = 0; OidIsValid(mapped_tables[i]); i++)
1553 156 : RelationMapRemoveMapping(mapped_tables[i]);
1554 :
1555 : /*
1556 : * At this point, everything is kosher except that, if we did toast swap
1557 : * by links, the toast table's name corresponds to the transient table.
1558 : * The name is irrelevant to the backend because it's referenced by OID,
1559 : * but users looking at the catalogs could be confused. Rename it to
1560 : * prevent this problem.
1561 : *
1562 : * Note no lock required on the relation, because we already hold an
1563 : * exclusive lock on it.
1564 : */
1565 1356 : if (!swap_toast_by_content)
1566 : {
1567 : Relation newrel;
1568 :
1569 1182 : newrel = table_open(OIDOldHeap, NoLock);
1570 1182 : if (OidIsValid(newrel->rd_rel->reltoastrelid))
1571 : {
1572 : Oid toastidx;
1573 : char NewToastName[NAMEDATALEN];
1574 :
1575 : /* Get the associated valid index to be renamed */
1576 340 : toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1577 : NoLock);
1578 :
1579 : /* rename the toast table ... */
1580 340 : snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1581 : OIDOldHeap);
1582 340 : RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1583 : NewToastName, true, false);
1584 :
1585 : /* ... and its valid index too. */
1586 340 : snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1587 : OIDOldHeap);
1588 :
1589 340 : RenameRelationInternal(toastidx,
1590 : NewToastName, true, true);
1591 :
1592 : /*
1593 : * Reset the relrewrite for the toast. The command-counter
1594 : * increment is required here as we are about to update the tuple
1595 : * that is updated as part of RenameRelationInternal.
1596 : */
1597 340 : CommandCounterIncrement();
1598 340 : ResetRelRewrite(newrel->rd_rel->reltoastrelid);
1599 : }
1600 1182 : relation_close(newrel, NoLock);
1601 : }
1602 :
1603 : /* if it's not a catalog table, clear any missing attribute settings */
1604 1356 : if (!is_system_catalog)
1605 : {
1606 : Relation newrel;
1607 :
1608 1156 : newrel = table_open(OIDOldHeap, NoLock);
1609 1156 : RelationClearMissing(newrel);
1610 1156 : relation_close(newrel, NoLock);
1611 : }
1612 1356 : }
1613 :
1614 :
1615 : /*
1616 : * Get a list of tables that the current user has privileges on and
1617 : * have indisclustered set. Return the list in a List * of RelToCluster
1618 : * (stored in the specified memory context), each one giving the tableOid
1619 : * and the indexOid on which the table is already clustered.
1620 : */
1621 : static List *
1622 28 : get_tables_to_cluster(MemoryContext cluster_context)
1623 : {
1624 : Relation indRelation;
1625 : TableScanDesc scan;
1626 : ScanKeyData entry;
1627 : HeapTuple indexTuple;
1628 : Form_pg_index index;
1629 : MemoryContext old_context;
1630 28 : List *rtcs = NIL;
1631 :
1632 : /*
1633 : * Get all indexes that have indisclustered set and that the current user
1634 : * has the appropriate privileges for.
1635 : */
1636 28 : indRelation = table_open(IndexRelationId, AccessShareLock);
1637 28 : ScanKeyInit(&entry,
1638 : Anum_pg_index_indisclustered,
1639 : BTEqualStrategyNumber, F_BOOLEQ,
1640 : BoolGetDatum(true));
1641 28 : scan = table_beginscan_catalog(indRelation, 1, &entry);
1642 46 : while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1643 : {
1644 : RelToCluster *rtc;
1645 :
1646 18 : index = (Form_pg_index) GETSTRUCT(indexTuple);
1647 :
1648 18 : if (!cluster_is_permitted_for_relation(index->indrelid, GetUserId()))
1649 12 : continue;
1650 :
1651 : /* Use a permanent memory context for the result list */
1652 6 : old_context = MemoryContextSwitchTo(cluster_context);
1653 :
1654 6 : rtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1655 6 : rtc->tableOid = index->indrelid;
1656 6 : rtc->indexOid = index->indexrelid;
1657 6 : rtcs = lappend(rtcs, rtc);
1658 :
1659 6 : MemoryContextSwitchTo(old_context);
1660 : }
1661 28 : table_endscan(scan);
1662 :
1663 28 : relation_close(indRelation, AccessShareLock);
1664 :
1665 28 : return rtcs;
1666 : }
1667 :
1668 : /*
1669 : * Given an index on a partitioned table, return a list of RelToCluster for
1670 : * all the children leaves tables/indexes.
1671 : *
1672 : * Like expand_vacuum_rel, but here caller must hold AccessExclusiveLock
1673 : * on the table containing the index.
1674 : */
1675 : static List *
1676 20 : get_tables_to_cluster_partitioned(MemoryContext cluster_context, Oid indexOid)
1677 : {
1678 : List *inhoids;
1679 : ListCell *lc;
1680 20 : List *rtcs = NIL;
1681 : MemoryContext old_context;
1682 :
1683 : /* Do not lock the children until they're processed */
1684 20 : inhoids = find_all_inheritors(indexOid, NoLock, NULL);
1685 :
1686 104 : foreach(lc, inhoids)
1687 : {
1688 84 : Oid indexrelid = lfirst_oid(lc);
1689 84 : Oid relid = IndexGetRelation(indexrelid, false);
1690 : RelToCluster *rtc;
1691 :
1692 : /* consider only leaf indexes */
1693 84 : if (get_rel_relkind(indexrelid) != RELKIND_INDEX)
1694 38 : continue;
1695 :
1696 : /*
1697 : * We already checked that the user has privileges to CLUSTER the
1698 : * partitioned table when we locked it earlier, so there's no need to
1699 : * check the privileges again here.
1700 : */
1701 :
1702 : /* Use a permanent memory context for the result list */
1703 46 : old_context = MemoryContextSwitchTo(cluster_context);
1704 :
1705 46 : rtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1706 46 : rtc->tableOid = relid;
1707 46 : rtc->indexOid = indexrelid;
1708 46 : rtcs = lappend(rtcs, rtc);
1709 :
1710 46 : MemoryContextSwitchTo(old_context);
1711 : }
1712 :
1713 20 : return rtcs;
1714 : }
1715 :
1716 : /*
1717 : * Return whether userid has privileges to CLUSTER relid. If not, this
1718 : * function emits a WARNING.
1719 : */
1720 : static bool
1721 70 : cluster_is_permitted_for_relation(Oid relid, Oid userid)
1722 : {
1723 104 : if (pg_class_aclcheck(relid, userid, ACL_MAINTAIN) == ACLCHECK_OK ||
1724 34 : has_partition_ancestor_privs(relid, userid, ACL_MAINTAIN))
1725 58 : return true;
1726 :
1727 12 : ereport(WARNING,
1728 : (errmsg("permission denied to cluster \"%s\", skipping it",
1729 : get_rel_name(relid))));
1730 12 : return false;
1731 : }
|