Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * relmapper.c
4 : * Catalog-to-filenumber mapping
5 : *
6 : * For most tables, the physical file underlying the table is specified by
7 : * pg_class.relfilenode. However, that obviously won't work for pg_class
8 : * itself, nor for the other "nailed" catalogs for which we have to be able
9 : * to set up working Relation entries without access to pg_class. It also
10 : * does not work for shared catalogs, since there is no practical way to
11 : * update other databases' pg_class entries when relocating a shared catalog.
12 : * Therefore, for these special catalogs (henceforth referred to as "mapped
13 : * catalogs") we rely on a separately maintained file that shows the mapping
14 : * from catalog OIDs to filenumbers. Each database has a map file for
15 : * its local mapped catalogs, and there is a separate map file for shared
16 : * catalogs. Mapped catalogs have zero in their pg_class.relfilenode entries.
17 : *
18 : * Relocation of a normal table is committed (ie, the new physical file becomes
19 : * authoritative) when the pg_class row update commits. For mapped catalogs,
20 : * the act of updating the map file is effectively commit of the relocation.
21 : * We postpone the file update till just before commit of the transaction
22 : * doing the rewrite, but there is necessarily a window between. Therefore
23 : * mapped catalogs can only be relocated by operations such as VACUUM FULL
24 : * and CLUSTER, which make no transactionally-significant changes: it must be
25 : * safe for the new file to replace the old, even if the transaction itself
26 : * aborts. An important factor here is that the indexes and toast table of
27 : * a mapped catalog must also be mapped, so that the rewrites/relocations of
28 : * all these files commit in a single map file update rather than being tied
29 : * to transaction commit.
30 : *
31 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
32 : * Portions Copyright (c) 1994, Regents of the University of California
33 : *
34 : *
35 : * IDENTIFICATION
36 : * src/backend/utils/cache/relmapper.c
37 : *
38 : *-------------------------------------------------------------------------
39 : */
40 : #include "postgres.h"
41 :
42 : #include <fcntl.h>
43 : #include <sys/stat.h>
44 : #include <unistd.h>
45 :
46 : #include "access/xact.h"
47 : #include "access/xlog.h"
48 : #include "access/xloginsert.h"
49 : #include "catalog/pg_tablespace.h"
50 : #include "catalog/storage.h"
51 : #include "miscadmin.h"
52 : #include "pgstat.h"
53 : #include "storage/fd.h"
54 : #include "storage/lwlock.h"
55 : #include "utils/inval.h"
56 : #include "utils/relmapper.h"
57 : #include "utils/wait_event.h"
58 :
59 :
/*
 * On-disk map file naming and format constants.
 *
 * The map file is critical data: there is no automatic way to recover if it
 * is lost or corrupted, so it carries a CRC that lets corruption be
 * detected.  Because the file may be larger than one standard-size disk
 * sector, overwrite-in-place cannot be relied upon; an update is instead
 * written to a new (temporary) file which is then renamed into place,
 * atomically replacing the original.
 *
 * The mappings[] array is kept in no particular order.  Insisting on OID
 * order would speed up searches, but that is not expected to be worth the
 * trouble at the intended size of the mapping sets.
 */
#define RELMAPPER_FILENAME		"pg_filenode.map"
#define RELMAPPER_TEMP_FILENAME	"pg_filenode.map.tmp"

#define RELMAPPER_FILEMAGIC		0x592717	/* version ID value */

/*
 * Capacity of a single map.  The exact value is arbitrary: it is simply a
 * round number modestly larger than the expected number of mappings, and it
 * can be raised if more mapped relations are ever needed.
 */
#define MAX_MAPPINGS			64
83 :
84 : typedef struct RelMapping
85 : {
86 : Oid mapoid; /* OID of a catalog */
87 : RelFileNumber mapfilenumber; /* its rel file number */
88 : } RelMapping;
89 :
90 : typedef struct RelMapFile
91 : {
92 : int32 magic; /* always RELMAPPER_FILEMAGIC */
93 : int32 num_mappings; /* number of valid RelMapping entries */
94 : RelMapping mappings[MAX_MAPPINGS];
95 : pg_crc32c crc; /* CRC of all above */
96 : } RelMapFile;
97 :
98 : /*
99 : * State for serializing local and shared relmappings for parallel workers
100 : * (active states only). See notes on active_* and pending_* updates state.
101 : */
102 : typedef struct SerializedActiveRelMaps
103 : {
104 : RelMapFile active_shared_updates;
105 : RelMapFile active_local_updates;
106 : } SerializedActiveRelMaps;
107 :
108 : /*
109 : * The currently known contents of the shared map file and our database's
110 : * local map file are stored here. These can be reloaded from disk
111 : * immediately whenever we receive an update sinval message.
112 : */
113 : static RelMapFile shared_map;
114 : static RelMapFile local_map;
115 :
116 : /*
117 : * We use the same RelMapFile data structure to track uncommitted local
118 : * changes in the mappings (but note the magic and crc fields are not made
119 : * valid in these variables). Currently, map updates are not allowed within
120 : * subtransactions, so one set of transaction-level changes is sufficient.
121 : *
122 : * The active_xxx variables contain updates that are valid in our transaction
123 : * and should be honored by RelationMapOidToFilenumber. The pending_xxx
124 : * variables contain updates we have been told about that aren't active yet;
125 : * they will become active at the next CommandCounterIncrement. This setup
126 : * lets map updates act similarly to updates of pg_class rows, ie, they
127 : * become visible only at the next CommandCounterIncrement boundary.
128 : *
129 : * Active shared and active local updates are serialized by the parallel
130 : * infrastructure, and deserialized within parallel workers.
131 : */
132 : static RelMapFile active_shared_updates;
133 : static RelMapFile active_local_updates;
134 : static RelMapFile pending_shared_updates;
135 : static RelMapFile pending_local_updates;
136 :
137 :
138 : /* non-export function prototypes */
139 : static void apply_map_update(RelMapFile *map, Oid relationId,
140 : RelFileNumber fileNumber, bool add_okay);
141 : static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
142 : bool add_okay);
143 : static void load_relmap_file(bool shared, bool lock_held);
144 : static void read_relmap_file(RelMapFile *map, char *dbpath, bool lock_held,
145 : int elevel);
146 : static void write_relmap_file(RelMapFile *newmap, bool write_wal,
147 : bool send_sinval, bool preserve_files,
148 : Oid dbid, Oid tsid, const char *dbpath);
149 : static void perform_relmap_update(bool shared, const RelMapFile *updates);
150 :
151 :
152 : /*
153 : * RelationMapOidToFilenumber
154 : *
155 : * The raison d' etre ... given a relation OID, look up its filenumber.
156 : *
157 : * Although shared and local relation OIDs should never overlap, the caller
158 : * always knows which we need --- so pass that information to avoid useless
159 : * searching.
160 : *
161 : * Returns InvalidRelFileNumber if the OID is not known (which should never
162 : * happen, but the caller is in a better position to report a meaningful
163 : * error).
164 : */
165 : RelFileNumber
166 831425 : RelationMapOidToFilenumber(Oid relationId, bool shared)
167 : {
168 : const RelMapFile *map;
169 : int32 i;
170 :
171 : /* If there are active updates, believe those over the main maps */
172 831425 : if (shared)
173 : {
174 514115 : map = &active_shared_updates;
175 515446 : for (i = 0; i < map->num_mappings; i++)
176 : {
177 2014 : if (relationId == map->mappings[i].mapoid)
178 683 : return map->mappings[i].mapfilenumber;
179 : }
180 513432 : map = &shared_map;
181 12130812 : for (i = 0; i < map->num_mappings; i++)
182 : {
183 12130812 : if (relationId == map->mappings[i].mapoid)
184 513432 : return map->mappings[i].mapfilenumber;
185 : }
186 : }
187 : else
188 : {
189 317310 : map = &active_local_updates;
190 319497 : for (i = 0; i < map->num_mappings; i++)
191 : {
192 3408 : if (relationId == map->mappings[i].mapoid)
193 1221 : return map->mappings[i].mapfilenumber;
194 : }
195 316089 : map = &local_map;
196 2370997 : for (i = 0; i < map->num_mappings; i++)
197 : {
198 2370997 : if (relationId == map->mappings[i].mapoid)
199 316089 : return map->mappings[i].mapfilenumber;
200 : }
201 : }
202 :
203 0 : return InvalidRelFileNumber;
204 : }
205 :
206 : /*
207 : * RelationMapFilenumberToOid
208 : *
209 : * Do the reverse of the normal direction of mapping done in
210 : * RelationMapOidToFilenumber.
211 : *
212 : * This is not supposed to be used during normal running but rather for
213 : * information purposes when looking at the filesystem or xlog.
214 : *
215 : * Returns InvalidOid if the OID is not known; this can easily happen if the
216 : * relfilenumber doesn't pertain to a mapped relation.
217 : */
218 : Oid
219 574 : RelationMapFilenumberToOid(RelFileNumber filenumber, bool shared)
220 : {
221 : const RelMapFile *map;
222 : int32 i;
223 :
224 : /* If there are active updates, believe those over the main maps */
225 574 : if (shared)
226 : {
227 151 : map = &active_shared_updates;
228 151 : for (i = 0; i < map->num_mappings; i++)
229 : {
230 0 : if (filenumber == map->mappings[i].mapfilenumber)
231 0 : return map->mappings[i].mapoid;
232 : }
233 151 : map = &shared_map;
234 3540 : for (i = 0; i < map->num_mappings; i++)
235 : {
236 3540 : if (filenumber == map->mappings[i].mapfilenumber)
237 151 : return map->mappings[i].mapoid;
238 : }
239 : }
240 : else
241 : {
242 423 : map = &active_local_updates;
243 423 : for (i = 0; i < map->num_mappings; i++)
244 : {
245 0 : if (filenumber == map->mappings[i].mapfilenumber)
246 0 : return map->mappings[i].mapoid;
247 : }
248 423 : map = &local_map;
249 2216 : for (i = 0; i < map->num_mappings; i++)
250 : {
251 2157 : if (filenumber == map->mappings[i].mapfilenumber)
252 364 : return map->mappings[i].mapoid;
253 : }
254 : }
255 :
256 59 : return InvalidOid;
257 : }
258 :
259 : /*
260 : * RelationMapOidToFilenumberForDatabase
261 : *
262 : * Like RelationMapOidToFilenumber, but reads the mapping from the indicated
263 : * path instead of using the one for the current database.
264 : */
265 : RelFileNumber
266 4698 : RelationMapOidToFilenumberForDatabase(char *dbpath, Oid relationId)
267 : {
268 : RelMapFile map;
269 : int i;
270 :
271 : /* Read the relmap file from the source database. */
272 4698 : read_relmap_file(&map, dbpath, false, ERROR);
273 :
274 : /* Iterate over the relmap entries to find the input relation OID. */
275 40194 : for (i = 0; i < map.num_mappings; i++)
276 : {
277 40194 : if (relationId == map.mappings[i].mapoid)
278 4698 : return map.mappings[i].mapfilenumber;
279 : }
280 :
281 0 : return InvalidRelFileNumber;
282 : }
283 :
284 : /*
285 : * RelationMapCopy
286 : *
287 : * Copy relmapfile from source db path to the destination db path and WAL log
288 : * the operation. This is intended for use in creating a new relmap file
289 : * for a database that doesn't have one yet, not for replacing an existing
290 : * relmap file.
291 : */
292 : void
293 261 : RelationMapCopy(Oid dbid, Oid tsid, char *srcdbpath, char *dstdbpath)
294 : {
295 : RelMapFile map;
296 :
297 : /*
298 : * Read the relmap file from the source database.
299 : */
300 261 : read_relmap_file(&map, srcdbpath, false, ERROR);
301 :
302 : /*
303 : * Write the same data into the destination database's relmap file.
304 : *
305 : * No sinval is needed because no one can be connected to the destination
306 : * database yet.
307 : *
308 : * There's no point in trying to preserve files here. The new database
309 : * isn't usable yet anyway, and won't ever be if we can't install a relmap
310 : * file.
311 : */
312 261 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
313 261 : write_relmap_file(&map, true, false, false, dbid, tsid, dstdbpath);
314 261 : LWLockRelease(RelationMappingLock);
315 261 : }
316 :
317 : /*
318 : * RelationMapUpdateMap
319 : *
320 : * Install a new relfilenumber mapping for the specified relation.
321 : *
322 : * If immediate is true (or we're bootstrapping), the mapping is activated
323 : * immediately. Otherwise it is made pending until CommandCounterIncrement.
324 : */
325 : void
326 4165 : RelationMapUpdateMap(Oid relationId, RelFileNumber fileNumber, bool shared,
327 : bool immediate)
328 : {
329 : RelMapFile *map;
330 :
331 4165 : if (IsBootstrapProcessingMode())
332 : {
333 : /*
334 : * In bootstrap mode, the mapping gets installed in permanent map.
335 : */
336 3417 : if (shared)
337 2346 : map = &shared_map;
338 : else
339 1071 : map = &local_map;
340 : }
341 : else
342 : {
343 : /*
344 : * We don't currently support map changes within subtransactions, or
345 : * when in parallel mode. This could be done with more bookkeeping
346 : * infrastructure, but it doesn't presently seem worth it.
347 : */
348 748 : if (GetCurrentTransactionNestLevel() > 1)
349 0 : elog(ERROR, "cannot change relation mapping within subtransaction");
350 :
351 748 : if (IsInParallelMode())
352 0 : elog(ERROR, "cannot change relation mapping in parallel mode");
353 :
354 748 : if (immediate)
355 : {
356 : /* Make it active, but only locally */
357 85 : if (shared)
358 0 : map = &active_shared_updates;
359 : else
360 85 : map = &active_local_updates;
361 : }
362 : else
363 : {
364 : /* Make it pending */
365 663 : if (shared)
366 332 : map = &pending_shared_updates;
367 : else
368 331 : map = &pending_local_updates;
369 : }
370 : }
371 4165 : apply_map_update(map, relationId, fileNumber, true);
372 4165 : }
373 :
374 : /*
375 : * apply_map_update
376 : *
377 : * Insert a new mapping into the given map variable, replacing any existing
378 : * mapping for the same relation.
379 : *
380 : * In some cases the caller knows there must be an existing mapping; pass
381 : * add_okay = false to draw an error if not.
382 : */
383 : static void
384 5390 : apply_map_update(RelMapFile *map, Oid relationId, RelFileNumber fileNumber,
385 : bool add_okay)
386 : {
387 : int32 i;
388 :
389 : /* Replace any existing mapping */
390 78432 : for (i = 0; i < map->num_mappings; i++)
391 : {
392 73905 : if (relationId == map->mappings[i].mapoid)
393 : {
394 863 : map->mappings[i].mapfilenumber = fileNumber;
395 863 : return;
396 : }
397 : }
398 :
399 : /* Nope, need to add a new mapping */
400 4527 : if (!add_okay)
401 0 : elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
402 : relationId);
403 4527 : if (map->num_mappings >= MAX_MAPPINGS)
404 0 : elog(ERROR, "ran out of space in relation map");
405 4527 : map->mappings[map->num_mappings].mapoid = relationId;
406 4527 : map->mappings[map->num_mappings].mapfilenumber = fileNumber;
407 4527 : map->num_mappings++;
408 : }
409 :
410 : /*
411 : * merge_map_updates
412 : *
413 : * Merge all the updates in the given pending-update map into the target map.
414 : * This is just a bulk form of apply_map_update.
415 : */
416 : static void
417 747 : merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
418 : {
419 : int32 i;
420 :
421 1972 : for (i = 0; i < updates->num_mappings; i++)
422 : {
423 1225 : apply_map_update(map,
424 1225 : updates->mappings[i].mapoid,
425 1225 : updates->mappings[i].mapfilenumber,
426 : add_okay);
427 : }
428 747 : }
429 :
430 : /*
431 : * RelationMapRemoveMapping
432 : *
433 : * Remove a relation's entry in the map. This is only allowed for "active"
434 : * (but not committed) local mappings. We need it so we can back out the
435 : * entry for the transient target file when doing VACUUM FULL/CLUSTER on
436 : * a mapped relation.
437 : */
438 : void
439 85 : RelationMapRemoveMapping(Oid relationId)
440 : {
441 85 : RelMapFile *map = &active_local_updates;
442 : int32 i;
443 :
444 128 : for (i = 0; i < map->num_mappings; i++)
445 : {
446 128 : if (relationId == map->mappings[i].mapoid)
447 : {
448 : /* Found it, collapse it out */
449 85 : map->mappings[i] = map->mappings[map->num_mappings - 1];
450 85 : map->num_mappings--;
451 85 : return;
452 : }
453 : }
454 0 : elog(ERROR, "could not find temporary mapping for relation %u",
455 : relationId);
456 : }
457 :
458 : /*
459 : * RelationMapInvalidate
460 : *
461 : * This routine is invoked for SI cache flush messages. We must re-read
462 : * the indicated map file. However, we might receive a SI message in a
463 : * process that hasn't yet, and might never, load the mapping files;
464 : * for example the autovacuum launcher, which *must not* try to read
465 : * a local map since it is attached to no particular database.
466 : * So, re-read only if the map is valid now.
467 : */
468 : void
469 296 : RelationMapInvalidate(bool shared)
470 : {
471 296 : if (shared)
472 : {
473 150 : if (shared_map.magic == RELMAPPER_FILEMAGIC)
474 150 : load_relmap_file(true, false);
475 : }
476 : else
477 : {
478 146 : if (local_map.magic == RELMAPPER_FILEMAGIC)
479 146 : load_relmap_file(false, false);
480 : }
481 296 : }
482 :
/*
 * RelationMapInvalidateAll
 *
 * Re-read every map file.  Used to recover from SI message buffer overflow,
 * after which we cannot tell whether an inval message was missed.
 *
 * Each call below reloads its map only if that map is currently valid; the
 * shared map is handled first, then the local one.
 */
void
RelationMapInvalidateAll(void)
{
	RelationMapInvalidate(true);
	RelationMapInvalidate(false);
}
498 :
499 : /*
500 : * AtCCI_RelationMap
501 : *
502 : * Activate any "pending" relation map updates at CommandCounterIncrement time.
503 : */
504 : void
505 596498 : AtCCI_RelationMap(void)
506 : {
507 596498 : if (pending_shared_updates.num_mappings != 0)
508 : {
509 306 : merge_map_updates(&active_shared_updates,
510 : &pending_shared_updates,
511 : true);
512 306 : pending_shared_updates.num_mappings = 0;
513 : }
514 596498 : if (pending_local_updates.num_mappings != 0)
515 : {
516 252 : merge_map_updates(&active_local_updates,
517 : &pending_local_updates,
518 : true);
519 252 : pending_local_updates.num_mappings = 0;
520 : }
521 596498 : }
522 :
523 : /*
524 : * AtEOXact_RelationMap
525 : *
526 : * Handle relation mapping at main-transaction commit or abort.
527 : *
528 : * During commit, this must be called as late as possible before the actual
529 : * transaction commit, so as to minimize the window where the transaction
530 : * could still roll back after committing map changes. Although nothing
531 : * critically bad happens in such a case, we still would prefer that it
532 : * not happen, since we'd possibly be losing useful updates to the relations'
533 : * pg_class row(s).
534 : *
535 : * During abort, we just have to throw away any pending map changes.
536 : * Normal post-abort cleanup will take care of fixing relcache entries.
537 : * Parallel worker commit/abort is handled by resetting active mappings
538 : * that may have been received from the leader process. (There should be
539 : * no pending updates in parallel workers.)
540 : */
541 : void
542 552889 : AtEOXact_RelationMap(bool isCommit, bool isParallelWorker)
543 : {
544 552889 : if (isCommit && !isParallelWorker)
545 : {
546 : /*
547 : * We should not get here with any "pending" updates. (We could
548 : * logically choose to treat such as committed, but in the current
549 : * code this should never happen.)
550 : */
551 : Assert(pending_shared_updates.num_mappings == 0);
552 : Assert(pending_local_updates.num_mappings == 0);
553 :
554 : /*
555 : * Write any active updates to the actual map files, then reset them.
556 : */
557 524760 : if (active_shared_updates.num_mappings != 0)
558 : {
559 118 : perform_relmap_update(true, &active_shared_updates);
560 118 : active_shared_updates.num_mappings = 0;
561 : }
562 524760 : if (active_local_updates.num_mappings != 0)
563 : {
564 71 : perform_relmap_update(false, &active_local_updates);
565 71 : active_local_updates.num_mappings = 0;
566 : }
567 : }
568 : else
569 : {
570 : /* Abort or parallel worker --- drop all local and pending updates */
571 : Assert(!isParallelWorker || pending_shared_updates.num_mappings == 0);
572 : Assert(!isParallelWorker || pending_local_updates.num_mappings == 0);
573 :
574 28129 : active_shared_updates.num_mappings = 0;
575 28129 : active_local_updates.num_mappings = 0;
576 28129 : pending_shared_updates.num_mappings = 0;
577 28129 : pending_local_updates.num_mappings = 0;
578 : }
579 552889 : }
580 :
581 : /*
582 : * AtPrepare_RelationMap
583 : *
584 : * Handle relation mapping at PREPARE.
585 : *
586 : * Currently, we don't support preparing any transaction that changes the map.
587 : */
588 : void
589 331 : AtPrepare_RelationMap(void)
590 : {
591 331 : if (active_shared_updates.num_mappings != 0 ||
592 331 : active_local_updates.num_mappings != 0 ||
593 331 : pending_shared_updates.num_mappings != 0 ||
594 331 : pending_local_updates.num_mappings != 0)
595 0 : ereport(ERROR,
596 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
597 : errmsg("cannot PREPARE a transaction that modified relation mapping")));
598 331 : }
599 :
600 : /*
601 : * CheckPointRelationMap
602 : *
603 : * This is called during a checkpoint. It must ensure that any relation map
604 : * updates that were WAL-logged before the start of the checkpoint are
605 : * securely flushed to disk and will not need to be replayed later. This
606 : * seems unlikely to be a performance-critical issue, so we use a simple
607 : * method: we just take and release the RelationMappingLock. This ensures
608 : * that any already-logged map update is complete, because write_relmap_file
609 : * will fsync the map file before the lock is released.
610 : */
611 : void
612 1806 : CheckPointRelationMap(void)
613 : {
614 1806 : LWLockAcquire(RelationMappingLock, LW_SHARED);
615 1806 : LWLockRelease(RelationMappingLock);
616 1806 : }
617 :
618 : /*
619 : * RelationMapFinishBootstrap
620 : *
621 : * Write out the initial relation mapping files at the completion of
622 : * bootstrap. All the mapped files should have been made known to us
623 : * via RelationMapUpdateMap calls.
624 : */
625 : void
626 51 : RelationMapFinishBootstrap(void)
627 : {
628 : Assert(IsBootstrapProcessingMode());
629 :
630 : /* Shouldn't be anything "pending" ... */
631 : Assert(active_shared_updates.num_mappings == 0);
632 : Assert(active_local_updates.num_mappings == 0);
633 : Assert(pending_shared_updates.num_mappings == 0);
634 : Assert(pending_local_updates.num_mappings == 0);
635 :
636 : /* Write the files; no WAL or sinval needed */
637 51 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
638 51 : write_relmap_file(&shared_map, false, false, false,
639 : InvalidOid, GLOBALTABLESPACE_OID, "global");
640 51 : write_relmap_file(&local_map, false, false, false,
641 : MyDatabaseId, MyDatabaseTableSpace, DatabasePath);
642 51 : LWLockRelease(RelationMappingLock);
643 51 : }
644 :
645 : /*
646 : * RelationMapInitialize
647 : *
648 : * This initializes the mapper module at process startup. We can't access the
649 : * database yet, so just make sure the maps are empty.
650 : */
651 : void
652 18757 : RelationMapInitialize(void)
653 : {
654 : /* The static variables should initialize to zeroes, but let's be sure */
655 18757 : shared_map.magic = 0; /* mark it not loaded */
656 18757 : local_map.magic = 0;
657 18757 : shared_map.num_mappings = 0;
658 18757 : local_map.num_mappings = 0;
659 18757 : active_shared_updates.num_mappings = 0;
660 18757 : active_local_updates.num_mappings = 0;
661 18757 : pending_shared_updates.num_mappings = 0;
662 18757 : pending_local_updates.num_mappings = 0;
663 18757 : }
664 :
/*
 * RelationMapInitializePhase2
 *
 * Startup step that prepares for access to pg_database.  The shared map
 * file should be readable by now.
 */
void
RelationMapInitializePhase2(void)
{
	/*
	 * Read the shared map file; failure to do so is fatal.  Bootstrap mode
	 * is the exception: the map file doesn't exist yet, so there is nothing
	 * to do.
	 */
	if (!IsBootstrapProcessingMode())
		load_relmap_file(true, false);
}
685 :
/*
 * RelationMapInitializePhase3
 *
 * Startup step run as soon as MyDatabaseId has been determined and
 * DatabasePath has been set up.  The local map file should be readable at
 * this point.
 */
void
RelationMapInitializePhase3(void)
{
	/*
	 * Read the local map file; failure to do so is fatal.  Bootstrap mode
	 * is the exception: the map file doesn't exist yet, so there is nothing
	 * to do.
	 */
	if (!IsBootstrapProcessingMode())
		load_relmap_file(false, false);
}
706 :
707 : /*
708 : * EstimateRelationMapSpace
709 : *
710 : * Estimate space needed to pass active shared and local relmaps to parallel
711 : * workers.
712 : */
713 : Size
714 498 : EstimateRelationMapSpace(void)
715 : {
716 498 : return sizeof(SerializedActiveRelMaps);
717 : }
718 :
719 : /*
720 : * SerializeRelationMap
721 : *
722 : * Serialize active shared and local relmap state for parallel workers.
723 : */
724 : void
725 498 : SerializeRelationMap(Size maxSize, char *startAddress)
726 : {
727 : SerializedActiveRelMaps *relmaps;
728 :
729 : Assert(maxSize >= EstimateRelationMapSpace());
730 :
731 498 : relmaps = (SerializedActiveRelMaps *) startAddress;
732 498 : relmaps->active_shared_updates = active_shared_updates;
733 498 : relmaps->active_local_updates = active_local_updates;
734 498 : }
735 :
736 : /*
737 : * RestoreRelationMap
738 : *
739 : * Restore active shared and local relmap state within a parallel worker.
740 : */
741 : void
742 1483 : RestoreRelationMap(char *startAddress)
743 : {
744 : SerializedActiveRelMaps *relmaps;
745 :
746 1483 : if (active_shared_updates.num_mappings != 0 ||
747 1483 : active_local_updates.num_mappings != 0 ||
748 1483 : pending_shared_updates.num_mappings != 0 ||
749 1483 : pending_local_updates.num_mappings != 0)
750 0 : elog(ERROR, "parallel worker has existing mappings");
751 :
752 1483 : relmaps = (SerializedActiveRelMaps *) startAddress;
753 1483 : active_shared_updates = relmaps->active_shared_updates;
754 1483 : active_local_updates = relmaps->active_local_updates;
755 1483 : }
756 :
757 : /*
758 : * load_relmap_file -- load the shared or local map file
759 : *
760 : * Because these files are essential for access to core system catalogs,
761 : * failure to load either of them is a fatal error.
762 : *
763 : * Note that the local case requires DatabasePath to be set up.
764 : */
765 : static void
766 41585 : load_relmap_file(bool shared, bool lock_held)
767 : {
768 41585 : if (shared)
769 21654 : read_relmap_file(&shared_map, "global", lock_held, FATAL);
770 : else
771 19931 : read_relmap_file(&local_map, DatabasePath, lock_held, FATAL);
772 41585 : }
773 :
774 : /*
775 : * read_relmap_file -- load data from any relation mapper file
776 : *
777 : * dbpath must be the relevant database path, or "global" for shared relations.
778 : *
779 : * RelationMappingLock will be acquired released unless lock_held = true.
780 : *
781 : * Errors will be reported at the indicated elevel, which should be at least
782 : * ERROR.
783 : */
784 : static void
785 46544 : read_relmap_file(RelMapFile *map, char *dbpath, bool lock_held, int elevel)
786 : {
787 : char mapfilename[MAXPGPATH];
788 : pg_crc32c crc;
789 : int fd;
790 : int r;
791 :
792 : Assert(elevel >= ERROR);
793 :
794 : /*
795 : * Grab the lock to prevent the file from being updated while we read it,
796 : * unless the caller is already holding the lock. If the file is updated
797 : * shortly after we look, the sinval signaling mechanism will make us
798 : * re-read it before we are able to access any relation that's affected by
799 : * the change.
800 : */
801 46544 : if (!lock_held)
802 46355 : LWLockAcquire(RelationMappingLock, LW_SHARED);
803 :
804 : /*
805 : * Open the target file.
806 : *
807 : * Because Windows isn't happy about the idea of renaming over a file that
808 : * someone has open, we only open this file after acquiring the lock, and
809 : * for the same reason, we close it before releasing the lock. That way,
810 : * by the time write_relmap_file() acquires an exclusive lock, no one else
811 : * will have it open.
812 : */
813 46544 : snprintf(mapfilename, sizeof(mapfilename), "%s/%s", dbpath,
814 : RELMAPPER_FILENAME);
815 46544 : fd = OpenTransientFile(mapfilename, O_RDONLY | PG_BINARY);
816 46544 : if (fd < 0)
817 0 : ereport(elevel,
818 : (errcode_for_file_access(),
819 : errmsg("could not open file \"%s\": %m",
820 : mapfilename)));
821 :
822 : /* Now read the data. */
823 46544 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_READ);
824 46544 : r = read(fd, map, sizeof(RelMapFile));
825 46544 : if (r != sizeof(RelMapFile))
826 : {
827 0 : if (r < 0)
828 0 : ereport(elevel,
829 : (errcode_for_file_access(),
830 : errmsg("could not read file \"%s\": %m", mapfilename)));
831 : else
832 0 : ereport(elevel,
833 : (errcode(ERRCODE_DATA_CORRUPTED),
834 : errmsg("could not read file \"%s\": read %d of %zu",
835 : mapfilename, r, sizeof(RelMapFile))));
836 : }
837 46544 : pgstat_report_wait_end();
838 :
839 46544 : if (CloseTransientFile(fd) != 0)
840 0 : ereport(elevel,
841 : (errcode_for_file_access(),
842 : errmsg("could not close file \"%s\": %m",
843 : mapfilename)));
844 :
845 46544 : if (!lock_held)
846 46355 : LWLockRelease(RelationMappingLock);
847 :
848 : /* check for correct magic number, etc */
849 46544 : if (map->magic != RELMAPPER_FILEMAGIC ||
850 46544 : map->num_mappings < 0 ||
851 46544 : map->num_mappings > MAX_MAPPINGS)
852 0 : ereport(elevel,
853 : (errmsg("relation mapping file \"%s\" contains invalid data",
854 : mapfilename)));
855 :
856 : /* verify the CRC */
857 46544 : INIT_CRC32C(crc);
858 46544 : COMP_CRC32C(crc, map, offsetof(RelMapFile, crc));
859 46544 : FIN_CRC32C(crc);
860 :
861 46544 : if (!EQ_CRC32C(crc, map->crc))
862 0 : ereport(elevel,
863 : (errmsg("relation mapping file \"%s\" contains incorrect checksum",
864 : mapfilename)));
865 46544 : }
866 :
867 : /*
868 : * Write out a new shared or local map file with the given contents.
869 : *
870 : * The magic number and CRC are automatically updated in *newmap. On
871 : * success, we copy the data to the appropriate permanent static variable.
872 : *
873 : * If write_wal is true then an appropriate WAL message is emitted.
874 : * (It will be false for bootstrap and WAL replay cases.)
875 : *
876 : * If send_sinval is true then a SI invalidation message is sent.
877 : * (This should be true except in bootstrap case.)
878 : *
879 : * If preserve_files is true then the storage manager is warned not to
880 : * delete the files listed in the map.
881 : *
882 : * Because this may be called during WAL replay when MyDatabaseId,
883 : * DatabasePath, etc aren't valid, we require the caller to pass in suitable
884 : * values. Pass dbpath as "global" for the shared map.
885 : *
886 : * The caller is also responsible for being sure no concurrent map update
887 : * could be happening.
888 : */
889 : static void
890 581 : write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
891 : bool preserve_files, Oid dbid, Oid tsid, const char *dbpath)
892 : {
893 : int fd;
894 : char mapfilename[MAXPGPATH];
895 : char maptempfilename[MAXPGPATH];
896 :
897 : /*
898 : * Even without concurrent use of this map, CheckPointRelationMap() relies
899 : * on this locking. Without it, a restore of a base backup taken after
900 : * this function's XLogInsert() and before its durable_rename() would not
901 : * have the changes. wal_level=minimal doesn't need the lock, but this
902 : * isn't performance-critical enough for such a micro-optimization.
903 : */
904 : Assert(LWLockHeldByMeInMode(RelationMappingLock, LW_EXCLUSIVE));
905 :
906 : /*
907 : * Fill in the overhead fields and update CRC.
908 : */
909 581 : newmap->magic = RELMAPPER_FILEMAGIC;
910 581 : if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
911 0 : elog(ERROR, "attempt to write bogus relation mapping");
912 :
913 581 : INIT_CRC32C(newmap->crc);
914 581 : COMP_CRC32C(newmap->crc, newmap, offsetof(RelMapFile, crc));
915 581 : FIN_CRC32C(newmap->crc);
916 :
917 : /*
918 : * Construct filenames -- a temporary file that we'll create to write the
919 : * data initially, and then the permanent name to which we will rename it.
920 : */
921 581 : snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
922 : dbpath, RELMAPPER_FILENAME);
923 581 : snprintf(maptempfilename, sizeof(maptempfilename), "%s/%s",
924 : dbpath, RELMAPPER_TEMP_FILENAME);
925 :
926 : /*
927 : * Open a temporary file. If a file already exists with this name, it must
928 : * be left over from a previous crash, so we can overwrite it. Concurrent
929 : * calls to this function are not allowed.
930 : */
931 581 : fd = OpenTransientFile(maptempfilename,
932 : O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY);
933 581 : if (fd < 0)
934 0 : ereport(ERROR,
935 : (errcode_for_file_access(),
936 : errmsg("could not open file \"%s\": %m",
937 : maptempfilename)));
938 :
939 : /* Write new data to the file. */
940 581 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
941 581 : if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
942 : {
943 : /* if write didn't set errno, assume problem is no disk space */
944 0 : if (errno == 0)
945 0 : errno = ENOSPC;
946 0 : ereport(ERROR,
947 : (errcode_for_file_access(),
948 : errmsg("could not write file \"%s\": %m",
949 : maptempfilename)));
950 : }
951 581 : pgstat_report_wait_end();
952 :
953 : /* And close the file. */
954 581 : if (CloseTransientFile(fd) != 0)
955 0 : ereport(ERROR,
956 : (errcode_for_file_access(),
957 : errmsg("could not close file \"%s\": %m",
958 : maptempfilename)));
959 :
960 581 : if (write_wal)
961 : {
962 : xl_relmap_update xlrec;
963 : XLogRecPtr lsn;
964 :
965 : /* now errors are fatal ... */
966 450 : START_CRIT_SECTION();
967 :
968 450 : xlrec.dbid = dbid;
969 450 : xlrec.tsid = tsid;
970 450 : xlrec.nbytes = sizeof(RelMapFile);
971 :
972 450 : XLogBeginInsert();
973 450 : XLogRegisterData(&xlrec, MinSizeOfRelmapUpdate);
974 450 : XLogRegisterData(newmap, sizeof(RelMapFile));
975 :
976 450 : lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE);
977 :
978 : /* As always, WAL must hit the disk before the data update does */
979 450 : XLogFlush(lsn);
980 : }
981 :
982 : /*
983 : * durable_rename() does all the hard work of making sure that we rename
984 : * the temporary file into place in a crash-safe manner.
985 : *
986 : * NB: Although we instruct durable_rename() to use ERROR, we will often
987 : * be in a critical section at this point; if so, ERROR will become PANIC.
988 : */
989 581 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_REPLACE);
990 581 : durable_rename(maptempfilename, mapfilename, ERROR);
991 581 : pgstat_report_wait_end();
992 :
993 : /*
994 : * Now that the file is safely on disk, send sinval message to let other
995 : * backends know to re-read it. We must do this inside the critical
996 : * section: if for some reason we fail to send the message, we have to
997 : * force a database-wide PANIC. Otherwise other backends might continue
998 : * execution with stale mapping information, which would be catastrophic
999 : * as soon as others began to use the now-committed data.
1000 : */
1001 581 : if (send_sinval)
1002 218 : CacheInvalidateRelmap(dbid);
1003 :
1004 : /*
1005 : * Make sure that the files listed in the map are not deleted if the outer
1006 : * transaction aborts. This had better be within the critical section
1007 : * too: it's not likely to fail, but if it did, we'd arrive at transaction
1008 : * abort with the files still vulnerable. PANICing will leave things in a
1009 : * good state on-disk.
1010 : *
1011 : * Note: we're cheating a little bit here by assuming that mapped files
1012 : * are either in pg_global or the database's default tablespace.
1013 : */
1014 581 : if (preserve_files)
1015 : {
1016 : int32 i;
1017 :
1018 6824 : for (i = 0; i < newmap->num_mappings; i++)
1019 : {
1020 : RelFileLocator rlocator;
1021 :
1022 6635 : rlocator.spcOid = tsid;
1023 6635 : rlocator.dbOid = dbid;
1024 6635 : rlocator.relNumber = newmap->mappings[i].mapfilenumber;
1025 6635 : RelationPreserveStorage(rlocator, false);
1026 : }
1027 : }
1028 :
1029 : /* Critical section done */
1030 581 : if (write_wal)
1031 450 : END_CRIT_SECTION();
1032 581 : }
1033 :
1034 : /*
1035 : * Merge the specified updates into the appropriate "real" map,
1036 : * and write out the changes. This function must be used for committing
1037 : * updates during normal multiuser operation.
1038 : */
1039 : static void
1040 189 : perform_relmap_update(bool shared, const RelMapFile *updates)
1041 : {
1042 : RelMapFile newmap;
1043 :
1044 : /*
1045 : * Anyone updating a relation's mapping info should take exclusive lock on
1046 : * that rel and hold it until commit. This ensures that there will not be
1047 : * concurrent updates on the same mapping value; but there could easily be
1048 : * concurrent updates on different values in the same file. We cover that
1049 : * by acquiring the RelationMappingLock, re-reading the target file to
1050 : * ensure it's up to date, applying the updates, and writing the data
1051 : * before releasing RelationMappingLock.
1052 : *
1053 : * There is only one RelationMappingLock. In principle we could try to
1054 : * have one per mapping file, but it seems unlikely to be worth the
1055 : * trouble.
1056 : */
1057 189 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
1058 :
1059 : /* Be certain we see any other updates just made */
1060 189 : load_relmap_file(shared, true);
1061 :
1062 : /* Prepare updated data in a local variable */
1063 189 : if (shared)
1064 118 : memcpy(&newmap, &shared_map, sizeof(RelMapFile));
1065 : else
1066 71 : memcpy(&newmap, &local_map, sizeof(RelMapFile));
1067 :
1068 : /*
1069 : * Apply the updates to newmap. No new mappings should appear, unless
1070 : * somebody is adding indexes to system catalogs.
1071 : */
1072 189 : merge_map_updates(&newmap, updates, allowSystemTableMods);
1073 :
1074 : /* Write out the updated map and do other necessary tasks */
1075 189 : write_relmap_file(&newmap, true, true, true,
1076 : (shared ? InvalidOid : MyDatabaseId),
1077 : (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
1078 : (shared ? "global" : DatabasePath));
1079 :
1080 : /*
1081 : * We successfully wrote the updated file, so it's now safe to rely on the
1082 : * new values in this process, too.
1083 : */
1084 189 : if (shared)
1085 118 : memcpy(&shared_map, &newmap, sizeof(RelMapFile));
1086 : else
1087 71 : memcpy(&local_map, &newmap, sizeof(RelMapFile));
1088 :
1089 : /* Now we can release the lock */
1090 189 : LWLockRelease(RelationMappingLock);
1091 189 : }
1092 :
1093 : /*
1094 : * RELMAP resource manager's routines
1095 : */
1096 : void
1097 29 : relmap_redo(XLogReaderState *record)
1098 : {
1099 29 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1100 :
1101 : /* Backup blocks are not used in relmap records */
1102 : Assert(!XLogRecHasAnyBlockRefs(record));
1103 :
1104 29 : if (info == XLOG_RELMAP_UPDATE)
1105 : {
1106 29 : xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record);
1107 : RelMapFile newmap;
1108 : char *dbpath;
1109 :
1110 29 : if (xlrec->nbytes != sizeof(RelMapFile))
1111 0 : elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
1112 : xlrec->nbytes);
1113 29 : memcpy(&newmap, xlrec->data, sizeof(newmap));
1114 :
1115 : /* We need to construct the pathname for this database */
1116 29 : dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
1117 :
1118 : /*
1119 : * Write out the new map and send sinval, but of course don't write a
1120 : * new WAL entry. There's no surrounding transaction to tell to
1121 : * preserve files, either.
1122 : *
1123 : * There shouldn't be anyone else updating relmaps during WAL replay,
1124 : * but grab the lock to interlock against load_relmap_file().
1125 : *
1126 : * Note that we use the same WAL record for updating the relmap of an
1127 : * existing database as we do for creating a new database. In the
1128 : * latter case, taking the relmap log and sending sinval messages is
1129 : * unnecessary, but harmless. If we wanted to avoid it, we could add a
1130 : * flag to the WAL record to indicate which operation is being
1131 : * performed.
1132 : */
1133 29 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
1134 29 : write_relmap_file(&newmap, false, true, false,
1135 : xlrec->dbid, xlrec->tsid, dbpath);
1136 29 : LWLockRelease(RelationMappingLock);
1137 :
1138 29 : pfree(dbpath);
1139 : }
1140 : else
1141 0 : elog(PANIC, "relmap_redo: unknown op code %u", info);
1142 29 : }
|