Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * relmapper.c
4 : * Catalog-to-filenumber mapping
5 : *
6 : * For most tables, the physical file underlying the table is specified by
7 : * pg_class.relfilenode. However, that obviously won't work for pg_class
8 : * itself, nor for the other "nailed" catalogs for which we have to be able
9 : * to set up working Relation entries without access to pg_class. It also
10 : * does not work for shared catalogs, since there is no practical way to
11 : * update other databases' pg_class entries when relocating a shared catalog.
12 : * Therefore, for these special catalogs (henceforth referred to as "mapped
13 : * catalogs") we rely on a separately maintained file that shows the mapping
14 : * from catalog OIDs to filenumbers. Each database has a map file for
15 : * its local mapped catalogs, and there is a separate map file for shared
16 : * catalogs. Mapped catalogs have zero in their pg_class.relfilenode entries.
17 : *
18 : * Relocation of a normal table is committed (ie, the new physical file becomes
19 : * authoritative) when the pg_class row update commits. For mapped catalogs,
20 : * the act of updating the map file is effectively commit of the relocation.
21 : * We postpone the file update till just before commit of the transaction
22 : * doing the rewrite, but there is necessarily a window between. Therefore
23 : * mapped catalogs can only be relocated by operations such as VACUUM FULL
24 : * and CLUSTER, which make no transactionally-significant changes: it must be
25 : * safe for the new file to replace the old, even if the transaction itself
26 : * aborts. An important factor here is that the indexes and toast table of
27 : * a mapped catalog must also be mapped, so that the rewrites/relocations of
28 : * all these files commit in a single map file update rather than being tied
29 : * to transaction commit.
30 : *
31 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
32 : * Portions Copyright (c) 1994, Regents of the University of California
33 : *
34 : *
35 : * IDENTIFICATION
36 : * src/backend/utils/cache/relmapper.c
37 : *
38 : *-------------------------------------------------------------------------
39 : */
40 : #include "postgres.h"
41 :
42 : #include <fcntl.h>
43 : #include <sys/stat.h>
44 : #include <unistd.h>
45 :
46 : #include "access/xact.h"
47 : #include "access/xlog.h"
48 : #include "access/xloginsert.h"
49 : #include "catalog/pg_tablespace.h"
50 : #include "catalog/storage.h"
51 : #include "miscadmin.h"
52 : #include "pgstat.h"
53 : #include "storage/fd.h"
54 : #include "storage/lwlock.h"
55 : #include "utils/inval.h"
56 : #include "utils/relmapper.h"
57 :
58 :
59 : /*
60 : * The map file is critical data: we have no automatic method for recovering
61 : * from loss or corruption of it. We use a CRC so that we can detect
62 : * corruption. Since the file might be more than one standard-size disk
63 : * sector in size, we cannot rely on overwrite-in-place. Instead, we generate
64 : * a new file and rename it into place, atomically replacing the original file.
65 : *
66 : * Entries in the mappings[] array are in no particular order. We could
67 : * speed searching by insisting on OID order, but it really shouldn't be
68 : * worth the trouble given the intended size of the mapping sets.
69 : */
70 : #define RELMAPPER_FILENAME "pg_filenode.map"
71 : #define RELMAPPER_TEMP_FILENAME "pg_filenode.map.tmp"
72 :
73 : #define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */
74 :
75 : /*
76 : * There's no need for this constant to have any particular value, and we
77 : * can raise it as necessary if we end up with more mapped relations. For
78 : * now, we just pick a round number that is modestly larger than the expected
79 : * number of mappings.
80 : */
81 : #define MAX_MAPPINGS 64
82 :
83 : typedef struct RelMapping
84 : {
85 : Oid mapoid; /* OID of a catalog */
86 : RelFileNumber mapfilenumber; /* its rel file number */
87 : } RelMapping;
88 :
89 : typedef struct RelMapFile
90 : {
91 : int32 magic; /* always RELMAPPER_FILEMAGIC */
92 : int32 num_mappings; /* number of valid RelMapping entries */
93 : RelMapping mappings[MAX_MAPPINGS];
94 : pg_crc32c crc; /* CRC of all above */
95 : } RelMapFile;
96 :
97 : /*
98 : * State for serializing local and shared relmappings for parallel workers
99 : * (active states only). See notes on active_* and pending_* updates state.
100 : */
101 : typedef struct SerializedActiveRelMaps
102 : {
103 : RelMapFile active_shared_updates;
104 : RelMapFile active_local_updates;
105 : } SerializedActiveRelMaps;
106 :
107 : /*
108 : * The currently known contents of the shared map file and our database's
109 : * local map file are stored here. These can be reloaded from disk
110 : * immediately whenever we receive an update sinval message.
111 : */
112 : static RelMapFile shared_map;
113 : static RelMapFile local_map;
114 :
115 : /*
116 : * We use the same RelMapFile data structure to track uncommitted local
117 : * changes in the mappings (but note the magic and crc fields are not made
118 : * valid in these variables). Currently, map updates are not allowed within
119 : * subtransactions, so one set of transaction-level changes is sufficient.
120 : *
121 : * The active_xxx variables contain updates that are valid in our transaction
122 : * and should be honored by RelationMapOidToFilenumber. The pending_xxx
123 : * variables contain updates we have been told about that aren't active yet;
124 : * they will become active at the next CommandCounterIncrement. This setup
125 : * lets map updates act similarly to updates of pg_class rows, ie, they
126 : * become visible only at the next CommandCounterIncrement boundary.
127 : *
128 : * Active shared and active local updates are serialized by the parallel
129 : * infrastructure, and deserialized within parallel workers.
130 : */
131 : static RelMapFile active_shared_updates;
132 : static RelMapFile active_local_updates;
133 : static RelMapFile pending_shared_updates;
134 : static RelMapFile pending_local_updates;
135 :
136 :
137 : /* non-export function prototypes */
138 : static void apply_map_update(RelMapFile *map, Oid relationId,
139 : RelFileNumber fileNumber, bool add_okay);
140 : static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
141 : bool add_okay);
142 : static void load_relmap_file(bool shared, bool lock_held);
143 : static void read_relmap_file(RelMapFile *map, char *dbpath, bool lock_held,
144 : int elevel);
145 : static void write_relmap_file(RelMapFile *newmap, bool write_wal,
146 : bool send_sinval, bool preserve_files,
147 : Oid dbid, Oid tsid, const char *dbpath);
148 : static void perform_relmap_update(bool shared, const RelMapFile *updates);
149 :
150 :
151 : /*
152 : * RelationMapOidToFilenumber
153 : *
154 : * The raison d' etre ... given a relation OID, look up its filenumber.
155 : *
156 : * Although shared and local relation OIDs should never overlap, the caller
157 : * always knows which we need --- so pass that information to avoid useless
158 : * searching.
159 : *
160 : * Returns InvalidRelFileNumber if the OID is not known (which should never
161 : * happen, but the caller is in a better position to report a meaningful
162 : * error).
163 : */
164 : RelFileNumber
165 1445416 : RelationMapOidToFilenumber(Oid relationId, bool shared)
166 : {
167 : const RelMapFile *map;
168 : int32 i;
169 :
170 : /* If there are active updates, believe those over the main maps */
171 1445416 : if (shared)
172 : {
173 890102 : map = &active_shared_updates;
174 892680 : for (i = 0; i < map->num_mappings; i++)
175 : {
176 3868 : if (relationId == map->mappings[i].mapoid)
177 1290 : return map->mappings[i].mapfilenumber;
178 : }
179 888812 : map = &shared_map;
180 22111340 : for (i = 0; i < map->num_mappings; i++)
181 : {
182 22111340 : if (relationId == map->mappings[i].mapoid)
183 888812 : return map->mappings[i].mapfilenumber;
184 : }
185 : }
186 : else
187 : {
188 555314 : map = &active_local_updates;
189 559462 : for (i = 0; i < map->num_mappings; i++)
190 : {
191 6468 : if (relationId == map->mappings[i].mapoid)
192 2320 : return map->mappings[i].mapfilenumber;
193 : }
194 552994 : map = &local_map;
195 4138102 : for (i = 0; i < map->num_mappings; i++)
196 : {
197 4138102 : if (relationId == map->mappings[i].mapoid)
198 552994 : return map->mappings[i].mapfilenumber;
199 : }
200 : }
201 :
202 0 : return InvalidRelFileNumber;
203 : }
204 :
205 : /*
206 : * RelationMapFilenumberToOid
207 : *
208 : * Do the reverse of the normal direction of mapping done in
209 : * RelationMapOidToFilenumber.
210 : *
211 : * This is not supposed to be used during normal running but rather for
212 : * information purposes when looking at the filesystem or xlog.
213 : *
214 : * Returns InvalidOid if the filenumber is not known; this can easily happen
215 : * if the relfilenumber doesn't pertain to a mapped relation.
216 : */
217 : Oid
218 1090 : RelationMapFilenumberToOid(RelFileNumber filenumber, bool shared)
219 : {
220 : const RelMapFile *map;
221 : int32 i;
222 :
223 : /* If there are active updates, believe those over the main maps */
224 1090 : if (shared)
225 : {
226 306 : map = &active_shared_updates;
227 306 : for (i = 0; i < map->num_mappings; i++)
228 : {
229 0 : if (filenumber == map->mappings[i].mapfilenumber)
230 0 : return map->mappings[i].mapoid;
231 : }
232 306 : map = &shared_map;
233 7442 : for (i = 0; i < map->num_mappings; i++)
234 : {
235 7442 : if (filenumber == map->mappings[i].mapfilenumber)
236 306 : return map->mappings[i].mapoid;
237 : }
238 : }
239 : else
240 : {
241 784 : map = &active_local_updates;
242 784 : for (i = 0; i < map->num_mappings; i++)
243 : {
244 0 : if (filenumber == map->mappings[i].mapfilenumber)
245 0 : return map->mappings[i].mapoid;
246 : }
247 784 : map = &local_map;
248 4238 : for (i = 0; i < map->num_mappings; i++)
249 : {
250 4126 : if (filenumber == map->mappings[i].mapfilenumber)
251 672 : return map->mappings[i].mapoid;
252 : }
253 : }
254 :
255 112 : return InvalidOid;
256 : }
257 :
258 : /*
259 : * RelationMapOidToFilenumberForDatabase
260 : *
261 : * Like RelationMapOidToFilenumber, but reads the mapping from the indicated
262 : * path instead of using the one for the current database.
263 : */
264 : RelFileNumber
265 7920 : RelationMapOidToFilenumberForDatabase(char *dbpath, Oid relationId)
266 : {
267 : RelMapFile map;
268 : int i;
269 :
270 : /* Read the relmap file from the source database. */
271 7920 : read_relmap_file(&map, dbpath, false, ERROR);
272 :
273 : /* Iterate over the relmap entries to find the input relation OID. */
274 67760 : for (i = 0; i < map.num_mappings; i++)
275 : {
276 67760 : if (relationId == map.mappings[i].mapoid)
277 7920 : return map.mappings[i].mapfilenumber;
278 : }
279 :
280 0 : return InvalidRelFileNumber;
281 : }
282 :
283 : /*
284 : * RelationMapCopy
285 : *
286 : * Copy relmapfile from source db path to the destination db path and WAL log
287 : * the operation. This is intended for use in creating a new relmap file
288 : * for a database that doesn't have one yet, not for replacing an existing
289 : * relmap file.
290 : */
291 : void
292 440 : RelationMapCopy(Oid dbid, Oid tsid, char *srcdbpath, char *dstdbpath)
293 : {
294 : RelMapFile map;
295 :
296 : /*
297 : * Read the relmap file from the source database.
298 : */
299 440 : read_relmap_file(&map, srcdbpath, false, ERROR);
300 :
301 : /*
302 : * Write the same data into the destination database's relmap file.
303 : *
304 : * No sinval is needed because no one can be connected to the destination
305 : * database yet.
306 : *
307 : * There's no point in trying to preserve files here. The new database
308 : * isn't usable yet anyway, and won't ever be if we can't install a relmap
309 : * file.
310 : */
311 440 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
312 440 : write_relmap_file(&map, true, false, false, dbid, tsid, dstdbpath);
313 440 : LWLockRelease(RelationMappingLock);
314 440 : }
315 :
316 : /*
317 : * RelationMapUpdateMap
318 : *
319 : * Install a new relfilenumber mapping for the specified relation.
320 : *
321 : * If immediate is true (or we're bootstrapping), the mapping is activated
322 : * immediately. Otherwise it is made pending until CommandCounterIncrement.
323 : */
324 : void
325 7620 : RelationMapUpdateMap(Oid relationId, RelFileNumber fileNumber, bool shared,
326 : bool immediate)
327 : {
328 : RelMapFile *map;
329 :
330 7620 : if (IsBootstrapProcessingMode())
331 : {
332 : /*
333 : * In bootstrap mode, the mapping gets installed in permanent map.
334 : */
335 6210 : if (shared)
336 4320 : map = &shared_map;
337 : else
338 1890 : map = &local_map;
339 : }
340 : else
341 : {
342 : /*
343 : * We don't currently support map changes within subtransactions, or
344 : * when in parallel mode. This could be done with more bookkeeping
345 : * infrastructure, but it doesn't presently seem worth it.
346 : */
347 1410 : if (GetCurrentTransactionNestLevel() > 1)
348 0 : elog(ERROR, "cannot change relation mapping within subtransaction");
349 :
350 1410 : if (IsInParallelMode())
351 0 : elog(ERROR, "cannot change relation mapping in parallel mode");
352 :
353 1410 : if (immediate)
354 : {
355 : /* Make it active, but only locally */
356 168 : if (shared)
357 0 : map = &active_shared_updates;
358 : else
359 168 : map = &active_local_updates;
360 : }
361 : else
362 : {
363 : /* Make it pending */
364 1242 : if (shared)
365 628 : map = &pending_shared_updates;
366 : else
367 614 : map = &pending_local_updates;
368 : }
369 : }
370 7620 : apply_map_update(map, relationId, fileNumber, true);
371 7620 : }
372 :
373 : /*
374 : * apply_map_update
375 : *
376 : * Insert a new mapping into the given map variable, replacing any existing
377 : * mapping for the same relation.
378 : *
379 : * In some cases the caller knows there must be an existing mapping; pass
380 : * add_okay = false to draw an error if not.
381 : */
382 : static void
383 9904 : apply_map_update(RelMapFile *map, Oid relationId, RelFileNumber fileNumber,
384 : bool add_okay)
385 : {
386 : int32 i;
387 :
388 : /* Replace any existing mapping */
389 149268 : for (i = 0; i < map->num_mappings; i++)
390 : {
391 140958 : if (relationId == map->mappings[i].mapoid)
392 : {
393 1594 : map->mappings[i].mapfilenumber = fileNumber;
394 1594 : return;
395 : }
396 : }
397 :
398 : /* Nope, need to add a new mapping */
399 8310 : if (!add_okay)
400 0 : elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
401 : relationId);
402 8310 : if (map->num_mappings >= MAX_MAPPINGS)
403 0 : elog(ERROR, "ran out of space in relation map");
404 8310 : map->mappings[map->num_mappings].mapoid = relationId;
405 8310 : map->mappings[map->num_mappings].mapfilenumber = fileNumber;
406 8310 : map->num_mappings++;
407 : }
408 :
409 : /*
410 : * merge_map_updates
411 : *
412 : * Merge all the updates in the given pending-update map into the target map.
413 : * This is just a bulk form of apply_map_update.
414 : */
415 : static void
416 1372 : merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
417 : {
418 : int32 i;
419 :
420 3656 : for (i = 0; i < updates->num_mappings; i++)
421 : {
422 2284 : apply_map_update(map,
423 : updates->mappings[i].mapoid,
424 : updates->mappings[i].mapfilenumber,
425 : add_okay);
426 : }
427 1372 : }
428 :
429 : /*
430 : * RelationMapRemoveMapping
431 : *
432 : * Remove a relation's entry in the map. This is only allowed for "active"
433 : * (but not committed) local mappings. We need it so we can back out the
434 : * entry for the transient target file when doing VACUUM FULL/CLUSTER on
435 : * a mapped relation.
436 : */
437 : void
438 168 : RelationMapRemoveMapping(Oid relationId)
439 : {
440 168 : RelMapFile *map = &active_local_updates;
441 : int32 i;
442 :
443 256 : for (i = 0; i < map->num_mappings; i++)
444 : {
445 256 : if (relationId == map->mappings[i].mapoid)
446 : {
447 : /* Found it, collapse it out */
448 168 : map->mappings[i] = map->mappings[map->num_mappings - 1];
449 168 : map->num_mappings--;
450 168 : return;
451 : }
452 : }
453 0 : elog(ERROR, "could not find temporary mapping for relation %u",
454 : relationId);
455 : }
456 :
457 : /*
458 : * RelationMapInvalidate
459 : *
460 : * This routine is invoked for SI cache flush messages. We must re-read
461 : * the indicated map file. However, we might receive a SI message in a
462 : * process that hasn't yet, and might never, load the mapping files;
463 : * for example the autovacuum launcher, which *must not* try to read
464 : * a local map since it is attached to no particular database.
465 : * So, re-read only if the map is valid now.
466 : */
467 : void
468 510 : RelationMapInvalidate(bool shared)
469 : {
470 510 : if (shared)
471 : {
472 268 : if (shared_map.magic == RELMAPPER_FILEMAGIC)
473 268 : load_relmap_file(true, false);
474 : }
475 : else
476 : {
477 242 : if (local_map.magic == RELMAPPER_FILEMAGIC)
478 242 : load_relmap_file(false, false);
479 : }
480 510 : }
481 :
482 : /*
483 : * RelationMapInvalidateAll
484 : *
485 : * Reload all map files. This is used to recover from SI message buffer
486 : * overflow: we can't be sure if we missed an inval message.
487 : * Again, reload only currently-valid maps.
488 : */
489 : void
490 4512 : RelationMapInvalidateAll(void)
491 : {
492 4512 : if (shared_map.magic == RELMAPPER_FILEMAGIC)
493 4512 : load_relmap_file(true, false);
494 4512 : if (local_map.magic == RELMAPPER_FILEMAGIC)
495 4382 : load_relmap_file(false, false);
496 4512 : }
497 :
498 : /*
499 : * AtCCI_RelationMap
500 : *
501 : * Activate any "pending" relation map updates at CommandCounterIncrement time.
502 : */
503 : void
504 1070870 : AtCCI_RelationMap(void)
505 : {
506 1070870 : if (pending_shared_updates.num_mappings != 0)
507 : {
508 572 : merge_map_updates(&active_shared_updates,
509 : &pending_shared_updates,
510 : true);
511 572 : pending_shared_updates.num_mappings = 0;
512 : }
513 1070870 : if (pending_local_updates.num_mappings != 0)
514 : {
515 458 : merge_map_updates(&active_local_updates,
516 : &pending_local_updates,
517 : true);
518 458 : pending_local_updates.num_mappings = 0;
519 : }
520 1070870 : }
521 :
522 : /*
523 : * AtEOXact_RelationMap
524 : *
525 : * Handle relation mapping at main-transaction commit or abort.
526 : *
527 : * During commit, this must be called as late as possible before the actual
528 : * transaction commit, so as to minimize the window where the transaction
529 : * could still roll back after committing map changes. Although nothing
530 : * critically bad happens in such a case, we still would prefer that it
531 : * not happen, since we'd possibly be losing useful updates to the relations'
532 : * pg_class row(s).
533 : *
534 : * During abort, we just have to throw away any pending map changes.
535 : * Normal post-abort cleanup will take care of fixing relcache entries.
536 : * Parallel worker commit/abort is handled by resetting active mappings
537 : * that may have been received from the leader process. (There should be
538 : * no pending updates in parallel workers.)
539 : */
540 : void
541 790508 : AtEOXact_RelationMap(bool isCommit, bool isParallelWorker)
542 : {
543 790508 : if (isCommit && !isParallelWorker)
544 : {
545 : /*
546 : * We should not get here with any "pending" updates. (We could
547 : * logically choose to treat such as committed, but in the current
548 : * code this should never happen.)
549 : */
550 : Assert(pending_shared_updates.num_mappings == 0);
551 : Assert(pending_local_updates.num_mappings == 0);
552 :
553 : /*
554 : * Write any active updates to the actual map files, then reset them.
555 : */
556 740686 : if (active_shared_updates.num_mappings != 0)
557 : {
558 214 : perform_relmap_update(true, &active_shared_updates);
559 214 : active_shared_updates.num_mappings = 0;
560 : }
561 740686 : if (active_local_updates.num_mappings != 0)
562 : {
563 128 : perform_relmap_update(false, &active_local_updates);
564 128 : active_local_updates.num_mappings = 0;
565 : }
566 : }
567 : else
568 : {
569 : /* Abort or parallel worker --- drop all local and pending updates */
570 : Assert(!isParallelWorker || pending_shared_updates.num_mappings == 0);
571 : Assert(!isParallelWorker || pending_local_updates.num_mappings == 0);
572 :
573 49822 : active_shared_updates.num_mappings = 0;
574 49822 : active_local_updates.num_mappings = 0;
575 49822 : pending_shared_updates.num_mappings = 0;
576 49822 : pending_local_updates.num_mappings = 0;
577 : }
578 790508 : }
579 :
580 : /*
581 : * AtPrepare_RelationMap
582 : *
583 : * Handle relation mapping at PREPARE.
584 : *
585 : * Currently, we don't support preparing any transaction that changes the map.
586 : */
587 : void
588 790 : AtPrepare_RelationMap(void)
589 : {
590 790 : if (active_shared_updates.num_mappings != 0 ||
591 790 : active_local_updates.num_mappings != 0 ||
592 790 : pending_shared_updates.num_mappings != 0 ||
593 790 : pending_local_updates.num_mappings != 0)
594 0 : ereport(ERROR,
595 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
596 : errmsg("cannot PREPARE a transaction that modified relation mapping")));
597 790 : }
598 :
599 : /*
600 : * CheckPointRelationMap
601 : *
602 : * This is called during a checkpoint. It must ensure that any relation map
603 : * updates that were WAL-logged before the start of the checkpoint are
604 : * securely flushed to disk and will not need to be replayed later. This
605 : * seems unlikely to be a performance-critical issue, so we use a simple
606 : * method: we just take and release the RelationMappingLock. This ensures
607 : * that any already-logged map update is complete, because write_relmap_file
608 : * will fsync the map file before the lock is released.
609 : */
610 : void
611 2474 : CheckPointRelationMap(void)
612 : {
613 2474 : LWLockAcquire(RelationMappingLock, LW_SHARED);
614 2474 : LWLockRelease(RelationMappingLock);
615 2474 : }
616 :
617 : /*
618 : * RelationMapFinishBootstrap
619 : *
620 : * Write out the initial relation mapping files at the completion of
621 : * bootstrap. All the mapped files should have been made known to us
622 : * via RelationMapUpdateMap calls.
623 : */
624 : void
625 90 : RelationMapFinishBootstrap(void)
626 : {
627 : Assert(IsBootstrapProcessingMode());
628 :
629 : /* Shouldn't be anything "pending" ... */
630 : Assert(active_shared_updates.num_mappings == 0);
631 : Assert(active_local_updates.num_mappings == 0);
632 : Assert(pending_shared_updates.num_mappings == 0);
633 : Assert(pending_local_updates.num_mappings == 0);
634 :
635 : /* Write the files; no WAL or sinval needed */
636 90 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
637 90 : write_relmap_file(&shared_map, false, false, false,
638 : InvalidOid, GLOBALTABLESPACE_OID, "global");
639 90 : write_relmap_file(&local_map, false, false, false,
640 : MyDatabaseId, MyDatabaseTableSpace, DatabasePath);
641 90 : LWLockRelease(RelationMappingLock);
642 90 : }
643 :
644 : /*
645 : * RelationMapInitialize
646 : *
647 : * This initializes the mapper module at process startup. We can't access the
648 : * database yet, so just make sure the maps are empty.
649 : */
650 : void
651 32998 : RelationMapInitialize(void)
652 : {
653 : /* The static variables should initialize to zeroes, but let's be sure */
654 32998 : shared_map.magic = 0; /* mark it not loaded */
655 32998 : local_map.magic = 0;
656 32998 : shared_map.num_mappings = 0;
657 32998 : local_map.num_mappings = 0;
658 32998 : active_shared_updates.num_mappings = 0;
659 32998 : active_local_updates.num_mappings = 0;
660 32998 : pending_shared_updates.num_mappings = 0;
661 32998 : pending_local_updates.num_mappings = 0;
662 32998 : }
663 :
664 : /*
665 : * RelationMapInitializePhase2
666 : *
667 : * This is called to prepare for access to pg_database during startup.
668 : * We should be able to read the shared map file now.
669 : */
670 : void
671 32998 : RelationMapInitializePhase2(void)
672 : {
673 : /*
674 : * In bootstrap mode, the map file isn't there yet, so do nothing.
675 : */
676 32998 : if (IsBootstrapProcessingMode())
677 90 : return;
678 :
679 : /*
680 : * Load the shared map file, die on error.
681 : */
682 32908 : load_relmap_file(true, false);
683 : }
684 :
685 : /*
686 : * RelationMapInitializePhase3
687 : *
688 : * This is called as soon as we have determined MyDatabaseId and set up
689 : * DatabasePath. At this point we should be able to read the local map file.
690 : */
691 : void
692 30490 : RelationMapInitializePhase3(void)
693 : {
694 : /*
695 : * In bootstrap mode, the map file isn't there yet, so do nothing.
696 : */
697 30490 : if (IsBootstrapProcessingMode())
698 90 : return;
699 :
700 : /*
701 : * Load the local map file, die on error.
702 : */
703 30400 : load_relmap_file(false, false);
704 : }
705 :
706 : /*
707 : * EstimateRelationMapSpace
708 : *
709 : * Estimate space needed to pass active shared and local relmaps to parallel
710 : * workers.
711 : */
712 : Size
713 892 : EstimateRelationMapSpace(void)
714 : {
715 892 : return sizeof(SerializedActiveRelMaps);
716 : }
717 :
718 : /*
719 : * SerializeRelationMap
720 : *
721 : * Serialize active shared and local relmap state for parallel workers.
722 : */
723 : void
724 892 : SerializeRelationMap(Size maxSize, char *startAddress)
725 : {
726 : SerializedActiveRelMaps *relmaps;
727 :
728 : Assert(maxSize >= EstimateRelationMapSpace());
729 :
730 892 : relmaps = (SerializedActiveRelMaps *) startAddress;
731 892 : relmaps->active_shared_updates = active_shared_updates;
732 892 : relmaps->active_local_updates = active_local_updates;
733 892 : }
734 :
735 : /*
736 : * RestoreRelationMap
737 : *
738 : * Restore active shared and local relmap state within a parallel worker.
739 : */
740 : void
741 2714 : RestoreRelationMap(char *startAddress)
742 : {
743 : SerializedActiveRelMaps *relmaps;
744 :
745 2714 : if (active_shared_updates.num_mappings != 0 ||
746 2714 : active_local_updates.num_mappings != 0 ||
747 2714 : pending_shared_updates.num_mappings != 0 ||
748 2714 : pending_local_updates.num_mappings != 0)
749 0 : elog(ERROR, "parallel worker has existing mappings");
750 :
751 2714 : relmaps = (SerializedActiveRelMaps *) startAddress;
752 2714 : active_shared_updates = relmaps->active_shared_updates;
753 2714 : active_local_updates = relmaps->active_local_updates;
754 2714 : }
755 :
756 : /*
757 : * load_relmap_file -- load the shared or local map file
758 : *
759 : * Because these files are essential for access to core system catalogs,
760 : * failure to load either of them is a fatal error.
761 : *
762 : * Note that the local case requires DatabasePath to be set up.
763 : */
764 : static void
765 73054 : load_relmap_file(bool shared, bool lock_held)
766 : {
767 73054 : if (shared)
768 37902 : read_relmap_file(&shared_map, "global", lock_held, FATAL);
769 : else
770 35152 : read_relmap_file(&local_map, DatabasePath, lock_held, FATAL);
771 73054 : }
772 :
773 : /*
774 : * read_relmap_file -- load data from any relation mapper file
775 : *
776 : * dbpath must be the relevant database path, or "global" for shared relations.
777 : *
778 : * RelationMappingLock will be acquired and released unless lock_held = true.
779 : *
780 : * Errors will be reported at the indicated elevel, which should be at least
781 : * ERROR.
782 : */
783 : static void
784 81414 : read_relmap_file(RelMapFile *map, char *dbpath, bool lock_held, int elevel)
785 : {
786 : char mapfilename[MAXPGPATH];
787 : pg_crc32c crc;
788 : int fd;
789 : int r;
790 :
791 : Assert(elevel >= ERROR);
792 :
793 : /*
794 : * Grab the lock to prevent the file from being updated while we read it,
795 : * unless the caller is already holding the lock. If the file is updated
796 : * shortly after we look, the sinval signaling mechanism will make us
797 : * re-read it before we are able to access any relation that's affected by
798 : * the change.
799 : */
800 81414 : if (!lock_held)
801 81072 : LWLockAcquire(RelationMappingLock, LW_SHARED);
802 :
803 : /*
804 : * Open the target file.
805 : *
806 : * Because Windows isn't happy about the idea of renaming over a file that
807 : * someone has open, we only open this file after acquiring the lock, and
808 : * for the same reason, we close it before releasing the lock. That way,
809 : * by the time write_relmap_file() acquires an exclusive lock, no one else
810 : * will have it open.
811 : */
812 81414 : snprintf(mapfilename, sizeof(mapfilename), "%s/%s", dbpath,
813 : RELMAPPER_FILENAME);
814 81414 : fd = OpenTransientFile(mapfilename, O_RDONLY | PG_BINARY);
815 81414 : if (fd < 0)
816 0 : ereport(elevel,
817 : (errcode_for_file_access(),
818 : errmsg("could not open file \"%s\": %m",
819 : mapfilename)));
820 :
821 : /* Now read the data. */
822 81414 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_READ);
823 81414 : r = read(fd, map, sizeof(RelMapFile));
824 81414 : if (r != sizeof(RelMapFile))
825 : {
826 0 : if (r < 0)
827 0 : ereport(elevel,
828 : (errcode_for_file_access(),
829 : errmsg("could not read file \"%s\": %m", mapfilename)));
830 : else
831 0 : ereport(elevel,
832 : (errcode(ERRCODE_DATA_CORRUPTED),
833 : errmsg("could not read file \"%s\": read %d of %zu",
834 : mapfilename, r, sizeof(RelMapFile))));
835 : }
836 81414 : pgstat_report_wait_end();
837 :
838 81414 : if (CloseTransientFile(fd) != 0)
839 0 : ereport(elevel,
840 : (errcode_for_file_access(),
841 : errmsg("could not close file \"%s\": %m",
842 : mapfilename)));
843 :
844 81414 : if (!lock_held)
845 81072 : LWLockRelease(RelationMappingLock);
846 :
847 : /* check for correct magic number, etc */
848 81414 : if (map->magic != RELMAPPER_FILEMAGIC ||
849 81414 : map->num_mappings < 0 ||
850 81414 : map->num_mappings > MAX_MAPPINGS)
851 0 : ereport(elevel,
852 : (errmsg("relation mapping file \"%s\" contains invalid data",
853 : mapfilename)));
854 :
855 : /* verify the CRC */
856 81414 : INIT_CRC32C(crc);
857 81414 : COMP_CRC32C(crc, (char *) map, offsetof(RelMapFile, crc));
858 81414 : FIN_CRC32C(crc);
859 :
860 81414 : if (!EQ_CRC32C(crc, map->crc))
861 0 : ereport(elevel,
862 : (errmsg("relation mapping file \"%s\" contains incorrect checksum",
863 : mapfilename)));
864 81414 : }
865 :
866 : /*
867 : * Write out a new shared or local map file with the given contents.
868 : *
869 : * The magic number and CRC are automatically updated in *newmap. On
870 : * success, we copy the data to the appropriate permanent static variable.
871 : *
872 : * If write_wal is true then an appropriate WAL message is emitted.
873 : * (It will be false for bootstrap and WAL replay cases.)
874 : *
875 : * If send_sinval is true then a SI invalidation message is sent.
876 : * (This should be true except in bootstrap case.)
877 : *
878 : * If preserve_files is true then the storage manager is warned not to
879 : * delete the files listed in the map.
880 : *
881 : * Because this may be called during WAL replay when MyDatabaseId,
882 : * DatabasePath, etc aren't valid, we require the caller to pass in suitable
883 : * values. Pass dbpath as "global" for the shared map.
884 : *
885 : * The caller is also responsible for being sure no concurrent map update
886 : * could be happening.
887 : */
888 : static void
889 1016 : write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
890 : bool preserve_files, Oid dbid, Oid tsid, const char *dbpath)
891 : {
892 : int fd;
893 : char mapfilename[MAXPGPATH];
894 : char maptempfilename[MAXPGPATH];
895 :
896 : /*
897 : * Even without concurrent use of this map, CheckPointRelationMap() relies
898 : * on this locking. Without it, a restore of a base backup taken after
899 : * this function's XLogInsert() and before its durable_rename() would not
900 : * have the changes. wal_level=minimal doesn't need the lock, but this
901 : * isn't performance-critical enough for such a micro-optimization.
902 : */
903 : Assert(LWLockHeldByMeInMode(RelationMappingLock, LW_EXCLUSIVE));
904 :
905 : /*
906 : * Fill in the overhead fields and update CRC.
907 : */
908 1016 : newmap->magic = RELMAPPER_FILEMAGIC;
909 1016 : if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
910 0 : elog(ERROR, "attempt to write bogus relation mapping");
911 :
912 1016 : INIT_CRC32C(newmap->crc);
913 1016 : COMP_CRC32C(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
914 1016 : FIN_CRC32C(newmap->crc);
915 :
916 : /*
917 : * Construct filenames -- a temporary file that we'll create to write the
918 : * data initially, and then the permanent name to which we will rename it.
919 : */
920 1016 : snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
921 : dbpath, RELMAPPER_FILENAME);
922 1016 : snprintf(maptempfilename, sizeof(maptempfilename), "%s/%s",
923 : dbpath, RELMAPPER_TEMP_FILENAME);
924 :
925 : /*
926 : * Open a temporary file. If a file already exists with this name, it must
927 : * be left over from a previous crash, so we can overwrite it. Concurrent
928 : * calls to this function are not allowed.
929 : */
930 1016 : fd = OpenTransientFile(maptempfilename,
931 : O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY);
932 1016 : if (fd < 0)
933 0 : ereport(ERROR,
934 : (errcode_for_file_access(),
935 : errmsg("could not open file \"%s\": %m",
936 : maptempfilename)));
937 :
938 : /* Write new data to the file. */
939 1016 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
940 1016 : if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
941 : {
942 : /* if write didn't set errno, assume problem is no disk space */
943 0 : if (errno == 0)
944 0 : errno = ENOSPC;
945 0 : ereport(ERROR,
946 : (errcode_for_file_access(),
947 : errmsg("could not write file \"%s\": %m",
948 : maptempfilename)));
949 : }
950 1016 : pgstat_report_wait_end();
951 :
952 : /* And close the file. */
953 1016 : if (CloseTransientFile(fd) != 0)
954 0 : ereport(ERROR,
955 : (errcode_for_file_access(),
956 : errmsg("could not close file \"%s\": %m",
957 : maptempfilename)));
958 :
959 1016 : if (write_wal)
960 : {
961 : xl_relmap_update xlrec;
962 : XLogRecPtr lsn;
963 :
964 : /* now errors are fatal ... */
965 782 : START_CRIT_SECTION();
966 :
967 782 : xlrec.dbid = dbid;
968 782 : xlrec.tsid = tsid;
969 782 : xlrec.nbytes = sizeof(RelMapFile);
970 :
971 782 : XLogBeginInsert();
972 782 : XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate);
973 782 : XLogRegisterData((char *) newmap, sizeof(RelMapFile));
974 :
975 782 : lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE);
976 :
977 : /* As always, WAL must hit the disk before the data update does */
978 782 : XLogFlush(lsn);
979 : }
980 :
981 : /*
982 : * durable_rename() does all the hard work of making sure that we rename
983 : * the temporary file into place in a crash-safe manner.
984 : *
985 : * NB: Although we instruct durable_rename() to use ERROR, we will often
986 : * be in a critical section at this point; if so, ERROR will become PANIC.
987 : */
988 1016 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_REPLACE);
989 1016 : durable_rename(maptempfilename, mapfilename, ERROR);
990 1016 : pgstat_report_wait_end();
991 :
992 : /*
993 : * Now that the file is safely on disk, send sinval message to let other
994 : * backends know to re-read it. We must do this inside the critical
995 : * section: if for some reason we fail to send the message, we have to
996 : * force a database-wide PANIC. Otherwise other backends might continue
997 : * execution with stale mapping information, which would be catastrophic
998 : * as soon as others began to use the now-committed data.
999 : */
1000 1016 : if (send_sinval)
1001 396 : CacheInvalidateRelmap(dbid);
1002 :
1003 : /*
1004 : * Make sure that the files listed in the map are not deleted if the outer
1005 : * transaction aborts. This had better be within the critical section
1006 : * too: it's not likely to fail, but if it did, we'd arrive at transaction
1007 : * abort with the files still vulnerable. PANICing will leave things in a
1008 : * good state on-disk.
1009 : *
1010 : * Note: we're cheating a little bit here by assuming that mapped files
1011 : * are either in pg_global or the database's default tablespace.
1012 : */
1013 1016 : if (preserve_files)
1014 : {
1015 : int32 i;
1016 :
1017 12790 : for (i = 0; i < newmap->num_mappings; i++)
1018 : {
1019 : RelFileLocator rlocator;
1020 :
1021 12448 : rlocator.spcOid = tsid;
1022 12448 : rlocator.dbOid = dbid;
1023 12448 : rlocator.relNumber = newmap->mappings[i].mapfilenumber;
1024 12448 : RelationPreserveStorage(rlocator, false);
1025 : }
1026 : }
1027 :
1028 : /* Critical section done */
1029 1016 : if (write_wal)
1030 782 : END_CRIT_SECTION();
1031 1016 : }
1032 :
1033 : /*
1034 : * Merge the specified updates into the appropriate "real" map,
1035 : * and write out the changes. This function must be used for committing
1036 : * updates during normal multiuser operation.
1037 : */
1038 : static void
1039 342 : perform_relmap_update(bool shared, const RelMapFile *updates)
1040 : {
1041 : RelMapFile newmap;
1042 :
1043 : /*
1044 : * Anyone updating a relation's mapping info should take exclusive lock on
1045 : * that rel and hold it until commit. This ensures that there will not be
1046 : * concurrent updates on the same mapping value; but there could easily be
1047 : * concurrent updates on different values in the same file. We cover that
1048 : * by acquiring the RelationMappingLock, re-reading the target file to
1049 : * ensure it's up to date, applying the updates, and writing the data
1050 : * before releasing RelationMappingLock.
1051 : *
1052 : * There is only one RelationMappingLock. In principle we could try to
1053 : * have one per mapping file, but it seems unlikely to be worth the
1054 : * trouble.
1055 : */
1056 342 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
1057 :
1058 : /* Be certain we see any other updates just made */
1059 342 : load_relmap_file(shared, true);
1060 :
1061 : /* Prepare updated data in a local variable */
1062 342 : if (shared)
1063 214 : memcpy(&newmap, &shared_map, sizeof(RelMapFile));
1064 : else
1065 128 : memcpy(&newmap, &local_map, sizeof(RelMapFile));
1066 :
1067 : /*
1068 : * Apply the updates to newmap. No new mappings should appear, unless
1069 : * somebody is adding indexes to system catalogs.
1070 : */
1071 342 : merge_map_updates(&newmap, updates, allowSystemTableMods);
1072 :
1073 : /* Write out the updated map and do other necessary tasks */
1074 342 : write_relmap_file(&newmap, true, true, true,
1075 : (shared ? InvalidOid : MyDatabaseId),
1076 : (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
1077 : (shared ? "global" : DatabasePath));
1078 :
1079 : /*
1080 : * We successfully wrote the updated file, so it's now safe to rely on the
1081 : * new values in this process, too.
1082 : */
1083 342 : if (shared)
1084 214 : memcpy(&shared_map, &newmap, sizeof(RelMapFile));
1085 : else
1086 128 : memcpy(&local_map, &newmap, sizeof(RelMapFile));
1087 :
1088 : /* Now we can release the lock */
1089 342 : LWLockRelease(RelationMappingLock);
1090 342 : }
1091 :
1092 : /*
1093 : * RELMAP resource manager's routines
1094 : */
1095 : void
1096 54 : relmap_redo(XLogReaderState *record)
1097 : {
1098 54 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1099 :
1100 : /* Backup blocks are not used in relmap records */
1101 : Assert(!XLogRecHasAnyBlockRefs(record));
1102 :
1103 54 : if (info == XLOG_RELMAP_UPDATE)
1104 : {
1105 54 : xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record);
1106 : RelMapFile newmap;
1107 : char *dbpath;
1108 :
1109 54 : if (xlrec->nbytes != sizeof(RelMapFile))
1110 0 : elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
1111 : xlrec->nbytes);
1112 54 : memcpy(&newmap, xlrec->data, sizeof(newmap));
1113 :
1114 : /* We need to construct the pathname for this database */
1115 54 : dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
1116 :
1117 : /*
1118 : * Write out the new map and send sinval, but of course don't write a
1119 : * new WAL entry. There's no surrounding transaction to tell to
1120 : * preserve files, either.
1121 : *
1122 : * There shouldn't be anyone else updating relmaps during WAL replay,
1123 : * but grab the lock to interlock against load_relmap_file().
1124 : *
1125 : * Note that we use the same WAL record for updating the relmap of an
1126 : * existing database as we do for creating a new database. In the
1127 : * latter case, taking the relmap log and sending sinval messages is
1128 : * unnecessary, but harmless. If we wanted to avoid it, we could add a
1129 : * flag to the WAL record to indicate which operation is being
1130 : * performed.
1131 : */
1132 54 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
1133 54 : write_relmap_file(&newmap, false, true, false,
1134 : xlrec->dbid, xlrec->tsid, dbpath);
1135 54 : LWLockRelease(RelationMappingLock);
1136 :
1137 54 : pfree(dbpath);
1138 : }
1139 : else
1140 0 : elog(PANIC, "relmap_redo: unknown op code %u", info);
1141 54 : }
|