Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * relmapper.c
4 : * Catalog-to-filenumber mapping
5 : *
6 : * For most tables, the physical file underlying the table is specified by
7 : * pg_class.relfilenode. However, that obviously won't work for pg_class
8 : * itself, nor for the other "nailed" catalogs for which we have to be able
9 : * to set up working Relation entries without access to pg_class. It also
10 : * does not work for shared catalogs, since there is no practical way to
11 : * update other databases' pg_class entries when relocating a shared catalog.
12 : * Therefore, for these special catalogs (henceforth referred to as "mapped
13 : * catalogs") we rely on a separately maintained file that shows the mapping
14 : * from catalog OIDs to filenumbers. Each database has a map file for
15 : * its local mapped catalogs, and there is a separate map file for shared
16 : * catalogs. Mapped catalogs have zero in their pg_class.relfilenode entries.
17 : *
18 : * Relocation of a normal table is committed (ie, the new physical file becomes
19 : * authoritative) when the pg_class row update commits. For mapped catalogs,
20 : * the act of updating the map file is effectively commit of the relocation.
21 : * We postpone the file update till just before commit of the transaction
22 : * doing the rewrite, but there is necessarily a window between. Therefore
23 : * mapped catalogs can only be relocated by operations such as VACUUM FULL
24 : * and CLUSTER, which make no transactionally-significant changes: it must be
25 : * safe for the new file to replace the old, even if the transaction itself
26 : * aborts. An important factor here is that the indexes and toast table of
27 : * a mapped catalog must also be mapped, so that the rewrites/relocations of
28 : * all these files commit in a single map file update rather than being tied
29 : * to transaction commit.
30 : *
31 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
32 : * Portions Copyright (c) 1994, Regents of the University of California
33 : *
34 : *
35 : * IDENTIFICATION
36 : * src/backend/utils/cache/relmapper.c
37 : *
38 : *-------------------------------------------------------------------------
39 : */
40 : #include "postgres.h"
41 :
42 : #include <fcntl.h>
43 : #include <sys/stat.h>
44 : #include <unistd.h>
45 :
46 : #include "access/xact.h"
47 : #include "access/xlog.h"
48 : #include "access/xloginsert.h"
49 : #include "catalog/pg_tablespace.h"
50 : #include "catalog/storage.h"
51 : #include "miscadmin.h"
52 : #include "pgstat.h"
53 : #include "storage/fd.h"
54 : #include "storage/lwlock.h"
55 : #include "utils/inval.h"
56 : #include "utils/relmapper.h"
57 :
58 :
59 : /*
60 : * The map file is critical data: we have no automatic method for recovering
61 : * from loss or corruption of it. We use a CRC so that we can detect
62 : * corruption. Since the file might be more than one standard-size disk
63 : * sector in size, we cannot rely on overwrite-in-place. Instead, we generate
64 : * a new file and rename it into place, atomically replacing the original file.
65 : *
66 : * Entries in the mappings[] array are in no particular order. We could
67 : * speed searching by insisting on OID order, but it really shouldn't be
68 : * worth the trouble given the intended size of the mapping sets.
69 : */
70 : #define RELMAPPER_FILENAME "pg_filenode.map" /* permanent map file name, appended to a database path */
71 : #define RELMAPPER_TEMP_FILENAME "pg_filenode.map.tmp" /* scratch file written first, then renamed to RELMAPPER_FILENAME */
72 :
73 : #define RELMAPPER_FILEMAGIC 0x592717 /* version ID value; stored in RelMapFile.magic */
74 :
75 : /*
76 : * There's no need for this constant to have any particular value, and we
77 : * can raise it as necessary if we end up with more mapped relations. For
78 : * now, we just pick a round number that is modestly larger than the expected
79 : * number of mappings.
80 : */
81 : #define MAX_MAPPINGS 64 /* capacity of RelMapFile.mappings[], hence part of sizeof(RelMapFile) */
82 :
83 : typedef struct RelMapping /* one catalog-OID-to-filenumber association */
84 : {
85 : Oid mapoid; /* OID of a mapped catalog (the lookup key) */
86 : RelFileNumber mapfilenumber; /* rel file number currently assigned to it */
87 : } RelMapping;
88 :
89 : typedef struct RelMapFile /* image of one map file; read and written as a single sizeof(RelMapFile) unit */
90 : {
91 : int32 magic; /* always RELMAPPER_FILEMAGIC */
92 : int32 num_mappings; /* number of valid RelMapping entries */
93 : RelMapping mappings[MAX_MAPPINGS]; /* only the first num_mappings slots are meaningful; unordered */
94 : pg_crc32c crc; /* CRC-32C of all above, i.e. the first offsetof(RelMapFile, crc) bytes */
95 : } RelMapFile;
96 :
97 : /*
98 : * State for serializing local and shared relmappings for parallel workers
99 : * (active states only). See notes on active_* and pending_* updates state.
100 : */
101 : typedef struct SerializedActiveRelMaps
102 : {
103 : RelMapFile active_shared_updates; /* copy of the leader's active shared-catalog updates */
104 : RelMapFile active_local_updates; /* copy of the leader's active local-catalog updates */
105 : } SerializedActiveRelMaps;
106 :
107 : /*
108 : * The currently known contents of the shared map file and our database's
109 : * local map file are stored here. These can be reloaded from disk
110 : * immediately whenever we receive an update sinval message.
111 : *
112 : * A map counts as loaded only while its magic field equals
113 : * RELMAPPER_FILEMAGIC; RelationMapInitialize zeroes magic to mark both
114 : * maps as not loaded.
115 : */
112 : static RelMapFile shared_map; /* contents of the shared-catalog map file */
113 : static RelMapFile local_map; /* contents of the current database's map file */
114 :
115 : /*
116 : * We use the same RelMapFile data structure to track uncommitted local
117 : * changes in the mappings (but note the magic and crc fields are not made
118 : * valid in these variables). Currently, map updates are not allowed within
119 : * subtransactions, so one set of transaction-level changes is sufficient.
120 : *
121 : * The active_xxx variables contain updates that are valid in our transaction
122 : * and should be honored by RelationMapOidToFilenumber. The pending_xxx
123 : * variables contain updates we have been told about that aren't active yet;
124 : * they will become active at the next CommandCounterIncrement. This setup
125 : * lets map updates act similarly to updates of pg_class rows, ie, they
126 : * become visible only at the next CommandCounterIncrement boundary.
127 : *
128 : * Active shared and active local updates are serialized by the parallel
129 : * infrastructure, and deserialized within parallel workers.
130 : */
131 : static RelMapFile active_shared_updates; /* shared-catalog updates already visible to this transaction */
132 : static RelMapFile active_local_updates; /* local-catalog updates already visible to this transaction */
133 : static RelMapFile pending_shared_updates; /* shared-catalog updates awaiting the next CommandCounterIncrement */
134 : static RelMapFile pending_local_updates; /* local-catalog updates awaiting the next CommandCounterIncrement */
135 :
136 :
137 : /* non-export function prototypes: static helpers private to this file */
138 : static void apply_map_update(RelMapFile *map, Oid relationId,
139 : RelFileNumber fileNumber, bool add_okay);
140 : static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
141 : bool add_okay);
142 : static void load_relmap_file(bool shared, bool lock_held);
143 : static void read_relmap_file(RelMapFile *map, char *dbpath, bool lock_held,
144 : int elevel);
145 : static void write_relmap_file(RelMapFile *newmap, bool write_wal,
146 : bool send_sinval, bool preserve_files,
147 : Oid dbid, Oid tsid, const char *dbpath);
148 : static void perform_relmap_update(bool shared, const RelMapFile *updates);
149 :
150 :
151 : /*
152 : * RelationMapOidToFilenumber
153 : *
154 : * The raison d' etre ... given a relation OID, look up its filenumber.
155 : *
156 : * Although shared and local relation OIDs should never overlap, the caller
157 : * always knows which we need --- so pass that information to avoid useless
158 : * searching.
159 : *
160 : * Returns InvalidRelFileNumber if the OID is not known (which should never
161 : * happen, but the caller is in a better position to report a meaningful
162 : * error).
163 : */
164 : RelFileNumber
165 1374510 : RelationMapOidToFilenumber(Oid relationId, bool shared)
166 : {
167 : const RelMapFile *map;
168 : int32 i;
169 :
170 : /* If there are active updates, believe those over the main maps */
171 1374510 : if (shared)
172 : {
173 842192 : map = &active_shared_updates;
174 845002 : for (i = 0; i < map->num_mappings; i++)
175 : {
176 4216 : if (relationId == map->mappings[i].mapoid)
177 1406 : return map->mappings[i].mapfilenumber;
178 : }
179 840786 : map = &shared_map;
180 20924162 : for (i = 0; i < map->num_mappings; i++)
181 : {
182 20924162 : if (relationId == map->mappings[i].mapoid)
183 840786 : return map->mappings[i].mapfilenumber;
184 : }
185 : }
186 : else
187 : {
188 532318 : map = &active_local_updates;
189 536552 : for (i = 0; i < map->num_mappings; i++)
190 : {
191 6598 : if (relationId == map->mappings[i].mapoid)
192 2364 : return map->mappings[i].mapfilenumber;
193 : }
194 529954 : map = &local_map;
195 3929092 : for (i = 0; i < map->num_mappings; i++)
196 : {
197 3929092 : if (relationId == map->mappings[i].mapoid)
198 529954 : return map->mappings[i].mapfilenumber;
199 : }
200 : }
201 :
202 0 : return InvalidRelFileNumber;
203 : }
204 :
205 : /*
206 : * RelationMapFilenumberToOid
207 : *
208 : * Do the reverse of the normal direction of mapping done in
209 : * RelationMapOidToFilenumber.
210 : *
211 : * This is not supposed to be used during normal running but rather for
212 : * information purposes when looking at the filesystem or xlog.
213 : *
214 : * Returns InvalidOid if the OID is not known; this can easily happen if the
215 : * relfilenumber doesn't pertain to a mapped relation.
216 : */
217 : Oid
218 1066 : RelationMapFilenumberToOid(RelFileNumber filenumber, bool shared)
219 : {
220 : const RelMapFile *map;
221 : int32 i;
222 :
223 : /* If there are active updates, believe those over the main maps */
224 1066 : if (shared)
225 : {
226 306 : map = &active_shared_updates;
227 306 : for (i = 0; i < map->num_mappings; i++)
228 : {
229 0 : if (filenumber == map->mappings[i].mapfilenumber)
230 0 : return map->mappings[i].mapoid;
231 : }
232 306 : map = &shared_map;
233 7442 : for (i = 0; i < map->num_mappings; i++)
234 : {
235 7442 : if (filenumber == map->mappings[i].mapfilenumber)
236 306 : return map->mappings[i].mapoid;
237 : }
238 : }
239 : else
240 : {
241 760 : map = &active_local_updates;
242 760 : for (i = 0; i < map->num_mappings; i++)
243 : {
244 0 : if (filenumber == map->mappings[i].mapfilenumber)
245 0 : return map->mappings[i].mapoid;
246 : }
247 760 : map = &local_map;
248 4182 : for (i = 0; i < map->num_mappings; i++)
249 : {
250 4070 : if (filenumber == map->mappings[i].mapfilenumber)
251 648 : return map->mappings[i].mapoid;
252 : }
253 : }
254 :
255 112 : return InvalidOid;
256 : }
257 :
258 : /*
259 : * RelationMapOidToFilenumberForDatabase
260 : *
261 : * Like RelationMapOidToFilenumber, but reads the mapping from the indicated
262 : * path instead of using the one for the current database.
263 : */
264 : RelFileNumber
265 7812 : RelationMapOidToFilenumberForDatabase(char *dbpath, Oid relationId)
266 : {
267 : RelMapFile map;
268 : int i;
269 :
270 : /* Read the relmap file from the source database. */
271 7812 : read_relmap_file(&map, dbpath, false, ERROR);
272 :
273 : /* Iterate over the relmap entries to find the input relation OID. */
274 66836 : for (i = 0; i < map.num_mappings; i++)
275 : {
276 66836 : if (relationId == map.mappings[i].mapoid)
277 7812 : return map.mappings[i].mapfilenumber;
278 : }
279 :
280 0 : return InvalidRelFileNumber;
281 : }
282 :
283 : /*
284 : * RelationMapCopy
285 : *
286 : * Copy relmapfile from source db path to the destination db path and WAL log
287 : * the operation. This is intended for use in creating a new relmap file
288 : * for a database that doesn't have one yet, not for replacing an existing
289 : * relmap file.
290 : */
291 : void
292 434 : RelationMapCopy(Oid dbid, Oid tsid, char *srcdbpath, char *dstdbpath)
293 : {
294 : RelMapFile map;
295 :
296 : /*
297 : * Read the relmap file from the source database.
298 : */
299 434 : read_relmap_file(&map, srcdbpath, false, ERROR);
300 :
301 : /*
302 : * Write the same data into the destination database's relmap file.
303 : *
304 : * No sinval is needed because no one can be connected to the destination
305 : * database yet.
306 : *
307 : * There's no point in trying to preserve files here. The new database
308 : * isn't usable yet anyway, and won't ever be if we can't install a relmap
309 : * file.
310 : */
311 434 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
312 434 : write_relmap_file(&map, true, false, false, dbid, tsid, dstdbpath);
313 434 : LWLockRelease(RelationMappingLock);
314 434 : }
315 :
316 : /*
317 : * RelationMapUpdateMap
318 : *
319 : * Install a new relfilenumber mapping for the specified relation.
320 : *
321 : * If immediate is true (or we're bootstrapping), the mapping is activated
322 : * immediately. Otherwise it is made pending until CommandCounterIncrement.
323 : */
324 : void
325 7700 : RelationMapUpdateMap(Oid relationId, RelFileNumber fileNumber, bool shared,
326 : bool immediate)
327 : {
328 : RelMapFile *map;
329 :
330 7700 : if (IsBootstrapProcessingMode())
331 : {
332 : /*
333 : * In bootstrap mode, the mapping gets installed in permanent map.
334 : */
335 6210 : if (shared)
336 4320 : map = &shared_map;
337 : else
338 1890 : map = &local_map;
339 : }
340 : else
341 : {
342 : /*
343 : * We don't currently support map changes within subtransactions, or
344 : * when in parallel mode. This could be done with more bookkeeping
345 : * infrastructure, but it doesn't presently seem worth it.
346 : */
347 1490 : if (GetCurrentTransactionNestLevel() > 1)
348 0 : elog(ERROR, "cannot change relation mapping within subtransaction");
349 :
350 1490 : if (IsInParallelMode())
351 0 : elog(ERROR, "cannot change relation mapping in parallel mode");
352 :
353 1490 : if (immediate)
354 : {
355 : /* Make it active, but only locally */
356 168 : if (shared)
357 0 : map = &active_shared_updates;
358 : else
359 168 : map = &active_local_updates;
360 : }
361 : else
362 : {
363 : /* Make it pending */
364 1322 : if (shared)
365 686 : map = &pending_shared_updates;
366 : else
367 636 : map = &pending_local_updates;
368 : }
369 : }
370 7700 : apply_map_update(map, relationId, fileNumber, true);
371 7700 : }
372 :
373 : /*
374 : * apply_map_update
375 : *
376 : * Insert a new mapping into the given map variable, replacing any existing
377 : * mapping for the same relation.
378 : *
379 : * In some cases the caller knows there must be an existing mapping; pass
380 : * add_okay = false to draw an error if not.
381 : */
382 : static void
383 10144 : apply_map_update(RelMapFile *map, Oid relationId, RelFileNumber fileNumber,
384 : bool add_okay)
385 : {
386 : int32 i;
387 :
388 : /* Replace any existing mapping */
389 151678 : for (i = 0; i < map->num_mappings; i++)
390 : {
391 143208 : if (relationId == map->mappings[i].mapoid)
392 : {
393 1674 : map->mappings[i].mapfilenumber = fileNumber;
394 1674 : return;
395 : }
396 : }
397 :
398 : /* Nope, need to add a new mapping */
399 8470 : if (!add_okay)
400 0 : elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
401 : relationId);
402 8470 : if (map->num_mappings >= MAX_MAPPINGS)
403 0 : elog(ERROR, "ran out of space in relation map");
404 8470 : map->mappings[map->num_mappings].mapoid = relationId;
405 8470 : map->mappings[map->num_mappings].mapfilenumber = fileNumber;
406 8470 : map->num_mappings++;
407 : }
408 :
409 : /*
410 : * merge_map_updates
411 : *
412 : * Merge all the updates in the given pending-update map into the target map.
413 : * This is just a bulk form of apply_map_update.
414 : */
415 : static void
416 1482 : merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
417 : {
418 : int32 i;
419 :
420 3926 : for (i = 0; i < updates->num_mappings; i++)
421 : {
422 2444 : apply_map_update(map,
423 : updates->mappings[i].mapoid,
424 : updates->mappings[i].mapfilenumber,
425 : add_okay);
426 : }
427 1482 : }
428 :
429 : /*
430 : * RelationMapRemoveMapping
431 : *
432 : * Remove a relation's entry in the map. This is only allowed for "active"
433 : * (but not committed) local mappings. We need it so we can back out the
434 : * entry for the transient target file when doing VACUUM FULL/CLUSTER on
435 : * a mapped relation.
436 : */
437 : void
438 168 : RelationMapRemoveMapping(Oid relationId)
439 : {
440 168 : RelMapFile *map = &active_local_updates;
441 : int32 i;
442 :
443 256 : for (i = 0; i < map->num_mappings; i++)
444 : {
445 256 : if (relationId == map->mappings[i].mapoid)
446 : {
447 : /* Found it, collapse it out */
448 168 : map->mappings[i] = map->mappings[map->num_mappings - 1];
449 168 : map->num_mappings--;
450 168 : return;
451 : }
452 : }
453 0 : elog(ERROR, "could not find temporary mapping for relation %u",
454 : relationId);
455 : }
456 :
457 : /*
458 : * RelationMapInvalidate
459 : *
460 : * This routine is invoked for SI cache flush messages. We must re-read
461 : * the indicated map file. However, we might receive a SI message in a
462 : * process that hasn't yet, and might never, load the mapping files;
463 : * for example the autovacuum launcher, which *must not* try to read
464 : * a local map since it is attached to no particular database.
465 : * So, re-read only if the map is valid now.
466 : */
467 : void
468 544 : RelationMapInvalidate(bool shared)
469 : {
470 544 : if (shared)
471 : {
472 292 : if (shared_map.magic == RELMAPPER_FILEMAGIC)
473 292 : load_relmap_file(true, false);
474 : }
475 : else
476 : {
477 252 : if (local_map.magic == RELMAPPER_FILEMAGIC)
478 252 : load_relmap_file(false, false);
479 : }
480 544 : }
481 :
482 : /*
483 : * RelationMapInvalidateAll
484 : *
485 : * Reload all map files. This is used to recover from SI message buffer
486 : * overflow: we can't be sure if we missed an inval message.
487 : * Again, reload only currently-valid maps.
488 : */
489 : void
490 4392 : RelationMapInvalidateAll(void)
491 : {
492 4392 : if (shared_map.magic == RELMAPPER_FILEMAGIC)
493 4392 : load_relmap_file(true, false);
494 4392 : if (local_map.magic == RELMAPPER_FILEMAGIC)
495 4274 : load_relmap_file(false, false);
496 4392 : }
497 :
498 : /*
499 : * AtCCI_RelationMap
500 : *
501 : * Activate any "pending" relation map updates at CommandCounterIncrement time.
502 : */
503 : void
504 1066954 : AtCCI_RelationMap(void)
505 : {
506 1066954 : if (pending_shared_updates.num_mappings != 0)
507 : {
508 630 : merge_map_updates(&active_shared_updates,
509 : &pending_shared_updates,
510 : true);
511 630 : pending_shared_updates.num_mappings = 0;
512 : }
513 1066954 : if (pending_local_updates.num_mappings != 0)
514 : {
515 480 : merge_map_updates(&active_local_updates,
516 : &pending_local_updates,
517 : true);
518 480 : pending_local_updates.num_mappings = 0;
519 : }
520 1066954 : }
521 :
522 : /*
523 : * AtEOXact_RelationMap
524 : *
525 : * Handle relation mapping at main-transaction commit or abort.
526 : *
527 : * During commit, this must be called as late as possible before the actual
528 : * transaction commit, so as to minimize the window where the transaction
529 : * could still roll back after committing map changes. Although nothing
530 : * critically bad happens in such a case, we still would prefer that it
531 : * not happen, since we'd possibly be losing useful updates to the relations'
532 : * pg_class row(s).
533 : *
534 : * During abort, we just have to throw away any pending map changes.
535 : * Normal post-abort cleanup will take care of fixing relcache entries.
536 : * Parallel worker commit/abort is handled by resetting active mappings
537 : * that may have been received from the leader process. (There should be
538 : * no pending updates in parallel workers.)
539 : */
540 : void
541 748428 : AtEOXact_RelationMap(bool isCommit, bool isParallelWorker)
542 : {
543 748428 : if (isCommit && !isParallelWorker)
544 : {
545 : /*
546 : * We should not get here with any "pending" updates. (We could
547 : * logically choose to treat such as committed, but in the current
548 : * code this should never happen.)
549 : */
550 : Assert(pending_shared_updates.num_mappings == 0);
551 : Assert(pending_local_updates.num_mappings == 0);
552 :
553 : /*
554 : * Write any active updates to the actual map files, then reset them.
555 : */
556 699160 : if (active_shared_updates.num_mappings != 0)
557 : {
558 236 : perform_relmap_update(true, &active_shared_updates);
559 236 : active_shared_updates.num_mappings = 0;
560 : }
561 699160 : if (active_local_updates.num_mappings != 0)
562 : {
563 136 : perform_relmap_update(false, &active_local_updates);
564 136 : active_local_updates.num_mappings = 0;
565 : }
566 : }
567 : else
568 : {
569 : /* Abort or parallel worker --- drop all local and pending updates */
570 : Assert(!isParallelWorker || pending_shared_updates.num_mappings == 0);
571 : Assert(!isParallelWorker || pending_local_updates.num_mappings == 0);
572 :
573 49268 : active_shared_updates.num_mappings = 0;
574 49268 : active_local_updates.num_mappings = 0;
575 49268 : pending_shared_updates.num_mappings = 0;
576 49268 : pending_local_updates.num_mappings = 0;
577 : }
578 748428 : }
579 :
580 : /*
581 : * AtPrepare_RelationMap
582 : *
583 : * Handle relation mapping at PREPARE.
584 : *
585 : * Currently, we don't support preparing any transaction that changes the map.
586 : */
587 : void
588 768 : AtPrepare_RelationMap(void)
589 : {
590 768 : if (active_shared_updates.num_mappings != 0 ||
591 768 : active_local_updates.num_mappings != 0 ||
592 768 : pending_shared_updates.num_mappings != 0 ||
593 768 : pending_local_updates.num_mappings != 0)
594 0 : ereport(ERROR,
595 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
596 : errmsg("cannot PREPARE a transaction that modified relation mapping")));
597 768 : }
598 :
599 : /*
600 : * CheckPointRelationMap
601 : *
602 : * This is called during a checkpoint. It must ensure that any relation map
603 : * updates that were WAL-logged before the start of the checkpoint are
604 : * securely flushed to disk and will not need to be replayed later. This
605 : * seems unlikely to be a performance-critical issue, so we use a simple
606 : * method: we just take and release the RelationMappingLock. This ensures
607 : * that any already-logged map update is complete, because write_relmap_file
608 : * will fsync the map file before the lock is released.
609 : */
610 : void
611 2476 : CheckPointRelationMap(void)
612 : {
613 2476 : LWLockAcquire(RelationMappingLock, LW_SHARED);
614 2476 : LWLockRelease(RelationMappingLock);
615 2476 : }
616 :
617 : /*
618 : * RelationMapFinishBootstrap
619 : *
620 : * Write out the initial relation mapping files at the completion of
621 : * bootstrap. All the mapped files should have been made known to us
622 : * via RelationMapUpdateMap calls.
623 : */
624 : void
625 90 : RelationMapFinishBootstrap(void)
626 : {
627 : Assert(IsBootstrapProcessingMode());
628 :
629 : /* Shouldn't be anything "pending" ... */
630 : Assert(active_shared_updates.num_mappings == 0);
631 : Assert(active_local_updates.num_mappings == 0);
632 : Assert(pending_shared_updates.num_mappings == 0);
633 : Assert(pending_local_updates.num_mappings == 0);
634 :
635 : /* Write the files; no WAL or sinval needed */
636 90 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
637 90 : write_relmap_file(&shared_map, false, false, false,
638 : InvalidOid, GLOBALTABLESPACE_OID, "global");
639 90 : write_relmap_file(&local_map, false, false, false,
640 : MyDatabaseId, MyDatabaseTableSpace, DatabasePath);
641 90 : LWLockRelease(RelationMappingLock);
642 90 : }
643 :
644 : /*
645 : * RelationMapInitialize
646 : *
647 : * This initializes the mapper module at process startup. We can't access the
648 : * database yet, so just make sure the maps are empty.
649 : */
650 : void
651 31284 : RelationMapInitialize(void)
652 : {
653 : /* The static variables should initialize to zeroes, but let's be sure */
654 31284 : shared_map.magic = 0; /* mark it not loaded */
655 31284 : local_map.magic = 0;
656 31284 : shared_map.num_mappings = 0;
657 31284 : local_map.num_mappings = 0;
658 31284 : active_shared_updates.num_mappings = 0;
659 31284 : active_local_updates.num_mappings = 0;
660 31284 : pending_shared_updates.num_mappings = 0;
661 31284 : pending_local_updates.num_mappings = 0;
662 31284 : }
663 :
664 : /*
665 : * RelationMapInitializePhase2
666 : *
667 : * This is called to prepare for access to pg_database during startup.
668 : * We should be able to read the shared map file now.
669 : */
670 : void
671 31284 : RelationMapInitializePhase2(void)
672 : {
673 : /*
674 : * In bootstrap mode, the map file isn't there yet, so do nothing.
675 : */
676 31284 : if (IsBootstrapProcessingMode())
677 90 : return;
678 :
679 : /*
680 : * Load the shared map file, die on error.
681 : */
682 31194 : load_relmap_file(true, false);
683 : }
684 :
685 : /*
686 : * RelationMapInitializePhase3
687 : *
688 : * This is called as soon as we have determined MyDatabaseId and set up
689 : * DatabasePath. At this point we should be able to read the local map file.
690 : */
691 : void
692 28786 : RelationMapInitializePhase3(void)
693 : {
694 : /*
695 : * In bootstrap mode, the map file isn't there yet, so do nothing.
696 : */
697 28786 : if (IsBootstrapProcessingMode())
698 90 : return;
699 :
700 : /*
701 : * Load the local map file, die on error.
702 : */
703 28696 : load_relmap_file(false, false);
704 : }
705 :
706 : /*
707 : * EstimateRelationMapSpace
708 : *
709 : * Estimate space needed to pass active shared and local relmaps to parallel
710 : * workers.
711 : */
712 : Size
713 886 : EstimateRelationMapSpace(void)
714 : {
715 886 : return sizeof(SerializedActiveRelMaps);
716 : }
717 :
718 : /*
719 : * SerializeRelationMap
720 : *
721 : * Serialize active shared and local relmap state for parallel workers.
722 : */
723 : void
724 886 : SerializeRelationMap(Size maxSize, char *startAddress)
725 : {
726 : SerializedActiveRelMaps *relmaps;
727 :
728 : Assert(maxSize >= EstimateRelationMapSpace());
729 :
730 886 : relmaps = (SerializedActiveRelMaps *) startAddress;
731 886 : relmaps->active_shared_updates = active_shared_updates;
732 886 : relmaps->active_local_updates = active_local_updates;
733 886 : }
734 :
735 : /*
736 : * RestoreRelationMap
737 : *
738 : * Restore active shared and local relmap state within a parallel worker.
739 : */
740 : void
741 2712 : RestoreRelationMap(char *startAddress)
742 : {
743 : SerializedActiveRelMaps *relmaps;
744 :
745 2712 : if (active_shared_updates.num_mappings != 0 ||
746 2712 : active_local_updates.num_mappings != 0 ||
747 2712 : pending_shared_updates.num_mappings != 0 ||
748 2712 : pending_local_updates.num_mappings != 0)
749 0 : elog(ERROR, "parallel worker has existing mappings");
750 :
751 2712 : relmaps = (SerializedActiveRelMaps *) startAddress;
752 2712 : active_shared_updates = relmaps->active_shared_updates;
753 2712 : active_local_updates = relmaps->active_local_updates;
754 2712 : }
755 :
756 : /*
757 : * load_relmap_file -- load the shared or local map file
758 : *
759 : * Because these files are essential for access to core system catalogs,
760 : * failure to load either of them is a fatal error.
761 : *
762 : * Note that the local case requires DatabasePath to be set up.
763 : */
764 : static void
765 69472 : load_relmap_file(bool shared, bool lock_held)
766 : {
767 69472 : if (shared)
768 36114 : read_relmap_file(&shared_map, "global", lock_held, FATAL);
769 : else
770 33358 : read_relmap_file(&local_map, DatabasePath, lock_held, FATAL);
771 69472 : }
772 :
773 : /*
774 : * read_relmap_file -- load data from any relation mapper file
775 : *
776 : * dbpath must be the relevant database path, or "global" for shared relations.
777 : *
778 : * RelationMappingLock will be acquired released unless lock_held = true.
779 : *
780 : * Errors will be reported at the indicated elevel, which should be at least
781 : * ERROR.
782 : */
783 : static void
784 77718 : read_relmap_file(RelMapFile *map, char *dbpath, bool lock_held, int elevel)
785 : {
786 : char mapfilename[MAXPGPATH];
787 : pg_crc32c crc;
788 : int fd;
789 : int r;
790 :
791 : Assert(elevel >= ERROR);
792 :
793 : /*
794 : * Grab the lock to prevent the file from being updated while we read it,
795 : * unless the caller is already holding the lock. If the file is updated
796 : * shortly after we look, the sinval signaling mechanism will make us
797 : * re-read it before we are able to access any relation that's affected by
798 : * the change.
799 : */
800 77718 : if (!lock_held)
801 77346 : LWLockAcquire(RelationMappingLock, LW_SHARED);
802 :
803 : /*
804 : * Open the target file.
805 : *
806 : * Because Windows isn't happy about the idea of renaming over a file that
807 : * someone has open, we only open this file after acquiring the lock, and
808 : * for the same reason, we close it before releasing the lock. That way,
809 : * by the time write_relmap_file() acquires an exclusive lock, no one else
810 : * will have it open.
811 : */
812 77718 : snprintf(mapfilename, sizeof(mapfilename), "%s/%s", dbpath,
813 : RELMAPPER_FILENAME);
814 77718 : fd = OpenTransientFile(mapfilename, O_RDONLY | PG_BINARY);
815 77718 : if (fd < 0)
816 0 : ereport(elevel,
817 : (errcode_for_file_access(),
818 : errmsg("could not open file \"%s\": %m",
819 : mapfilename)));
820 :
821 : /* Now read the data. */
822 77718 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_READ);
823 77718 : r = read(fd, map, sizeof(RelMapFile));
824 77718 : if (r != sizeof(RelMapFile))
825 : {
826 0 : if (r < 0)
827 0 : ereport(elevel,
828 : (errcode_for_file_access(),
829 : errmsg("could not read file \"%s\": %m", mapfilename)));
830 : else
831 0 : ereport(elevel,
832 : (errcode(ERRCODE_DATA_CORRUPTED),
833 : errmsg("could not read file \"%s\": read %d of %zu",
834 : mapfilename, r, sizeof(RelMapFile))));
835 : }
836 77718 : pgstat_report_wait_end();
837 :
838 77718 : if (CloseTransientFile(fd) != 0)
839 0 : ereport(elevel,
840 : (errcode_for_file_access(),
841 : errmsg("could not close file \"%s\": %m",
842 : mapfilename)));
843 :
844 77718 : if (!lock_held)
845 77346 : LWLockRelease(RelationMappingLock);
846 :
847 : /* check for correct magic number, etc */
848 77718 : if (map->magic != RELMAPPER_FILEMAGIC ||
849 77718 : map->num_mappings < 0 ||
850 77718 : map->num_mappings > MAX_MAPPINGS)
851 0 : ereport(elevel,
852 : (errmsg("relation mapping file \"%s\" contains invalid data",
853 : mapfilename)));
854 :
855 : /* verify the CRC */
856 77718 : INIT_CRC32C(crc);
857 77718 : COMP_CRC32C(crc, (char *) map, offsetof(RelMapFile, crc));
858 77718 : FIN_CRC32C(crc);
859 :
860 77718 : if (!EQ_CRC32C(crc, map->crc))
861 0 : ereport(elevel,
862 : (errmsg("relation mapping file \"%s\" contains incorrect checksum",
863 : mapfilename)));
864 77718 : }
865 :
866 : /*
867 : * Write out a new shared or local map file with the given contents.
868 : *
869 : * The magic number and CRC are automatically updated in *newmap. On
870 : * success, we copy the data to the appropriate permanent static variable.
871 : *
872 : * If write_wal is true then an appropriate WAL message is emitted.
873 : * (It will be false for bootstrap and WAL replay cases.)
874 : *
875 : * If send_sinval is true then a SI invalidation message is sent.
876 : * (This should be true except in bootstrap case.)
877 : *
878 : * If preserve_files is true then the storage manager is warned not to
879 : * delete the files listed in the map.
880 : *
881 : * Because this may be called during WAL replay when MyDatabaseId,
882 : * DatabasePath, etc aren't valid, we require the caller to pass in suitable
883 : * values. Pass dbpath as "global" for the shared map.
884 : *
885 : * The caller is also responsible for being sure no concurrent map update
886 : * could be happening.
887 : */
888 : static void
889 1040 : write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
890 : bool preserve_files, Oid dbid, Oid tsid, const char *dbpath)
891 : {
892 : int fd;
893 : char mapfilename[MAXPGPATH];
894 : char maptempfilename[MAXPGPATH];
895 :
896 : /*
897 : * Even without concurrent use of this map, CheckPointRelationMap() relies
898 : * on this locking. Without it, a restore of a base backup taken after
899 : * this function's XLogInsert() and before its durable_rename() would not
900 : * have the changes. wal_level=minimal doesn't need the lock, but this
901 : * isn't performance-critical enough for such a micro-optimization.
902 : */
903 : Assert(LWLockHeldByMeInMode(RelationMappingLock, LW_EXCLUSIVE));
904 :
905 : /*
906 : * Fill in the overhead fields and update CRC.
907 : */
908 1040 : newmap->magic = RELMAPPER_FILEMAGIC;
909 1040 : if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
910 0 : elog(ERROR, "attempt to write bogus relation mapping");
911 :
912 1040 : INIT_CRC32C(newmap->crc);
913 1040 : COMP_CRC32C(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
914 1040 : FIN_CRC32C(newmap->crc);
915 :
916 : /*
917 : * Construct filenames -- a temporary file that we'll create to write the
918 : * data initially, and then the permanent name to which we will rename it.
919 : */
920 1040 : snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
921 : dbpath, RELMAPPER_FILENAME);
922 1040 : snprintf(maptempfilename, sizeof(maptempfilename), "%s/%s",
923 : dbpath, RELMAPPER_TEMP_FILENAME);
924 :
925 : /*
926 : * Open a temporary file. If a file already exists with this name, it must
927 : * be left over from a previous crash, so we can overwrite it. Concurrent
928 : * calls to this function are not allowed.
929 : */
930 1040 : fd = OpenTransientFile(maptempfilename,
931 : O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY);
932 1040 : if (fd < 0)
933 0 : ereport(ERROR,
934 : (errcode_for_file_access(),
935 : errmsg("could not open file \"%s\": %m",
936 : maptempfilename)));
937 :
938 : /* Write new data to the file. */
939 1040 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
940 1040 : if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
941 : {
942 : /* if write didn't set errno, assume problem is no disk space */
943 0 : if (errno == 0)
944 0 : errno = ENOSPC;
945 0 : ereport(ERROR,
946 : (errcode_for_file_access(),
947 : errmsg("could not write file \"%s\": %m",
948 : maptempfilename)));
949 : }
950 1040 : pgstat_report_wait_end();
951 :
952 : /* And close the file. */
953 1040 : if (CloseTransientFile(fd) != 0)
954 0 : ereport(ERROR,
955 : (errcode_for_file_access(),
956 : errmsg("could not close file \"%s\": %m",
957 : maptempfilename)));
958 :
959 1040 : if (write_wal)
960 : {
961 : xl_relmap_update xlrec;
962 : XLogRecPtr lsn;
963 :
964 : /* now errors are fatal ... */
965 806 : START_CRIT_SECTION();
966 :
967 806 : xlrec.dbid = dbid;
968 806 : xlrec.tsid = tsid;
969 806 : xlrec.nbytes = sizeof(RelMapFile);
970 :
971 806 : XLogBeginInsert();
972 806 : XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate);
973 806 : XLogRegisterData((char *) newmap, sizeof(RelMapFile));
974 :
975 806 : lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE);
976 :
977 : /* As always, WAL must hit the disk before the data update does */
978 806 : XLogFlush(lsn);
979 : }
980 :
981 : /*
982 : * durable_rename() does all the hard work of making sure that we rename
983 : * the temporary file into place in a crash-safe manner.
984 : *
985 : * NB: Although we instruct durable_rename() to use ERROR, we will often
986 : * be in a critical section at this point; if so, ERROR will become PANIC.
987 : */
988 1040 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_REPLACE);
989 1040 : durable_rename(maptempfilename, mapfilename, ERROR);
990 1040 : pgstat_report_wait_end();
991 :
992 : /*
993 : * Now that the file is safely on disk, send sinval message to let other
994 : * backends know to re-read it. We must do this inside the critical
995 : * section: if for some reason we fail to send the message, we have to
996 : * force a database-wide PANIC. Otherwise other backends might continue
997 : * execution with stale mapping information, which would be catastrophic
998 : * as soon as others began to use the now-committed data.
999 : */
1000 1040 : if (send_sinval)
1001 426 : CacheInvalidateRelmap(dbid);
1002 :
1003 : /*
1004 : * Make sure that the files listed in the map are not deleted if the outer
1005 : * transaction aborts. This had better be within the critical section
1006 : * too: it's not likely to fail, but if it did, we'd arrive at transaction
1007 : * abort with the files still vulnerable. PANICing will leave things in a
1008 : * good state on-disk.
1009 : *
1010 : * Note: we're cheating a little bit here by assuming that mapped files
1011 : * are either in pg_global or the database's default tablespace.
1012 : */
1013 1040 : if (preserve_files)
1014 : {
1015 : int32 i;
1016 :
1017 14012 : for (i = 0; i < newmap->num_mappings; i++)
1018 : {
1019 : RelFileLocator rlocator;
1020 :
1021 13640 : rlocator.spcOid = tsid;
1022 13640 : rlocator.dbOid = dbid;
1023 13640 : rlocator.relNumber = newmap->mappings[i].mapfilenumber;
1024 13640 : RelationPreserveStorage(rlocator, false);
1025 : }
1026 : }
1027 :
1028 : /* Critical section done */
1029 1040 : if (write_wal)
1030 806 : END_CRIT_SECTION();
1031 1040 : }
1032 :
1033 : /*
1034 : * Merge the specified updates into the appropriate "real" map,
1035 : * and write out the changes. This function must be used for committing
1036 : * updates during normal multiuser operation.
1037 : */
1038 : static void
1039 372 : perform_relmap_update(bool shared, const RelMapFile *updates)
1040 : {
1041 : RelMapFile newmap;
1042 :
1043 : /*
1044 : * Anyone updating a relation's mapping info should take exclusive lock on
1045 : * that rel and hold it until commit. This ensures that there will not be
1046 : * concurrent updates on the same mapping value; but there could easily be
1047 : * concurrent updates on different values in the same file. We cover that
1048 : * by acquiring the RelationMappingLock, re-reading the target file to
1049 : * ensure it's up to date, applying the updates, and writing the data
1050 : * before releasing RelationMappingLock.
1051 : *
1052 : * There is only one RelationMappingLock. In principle we could try to
1053 : * have one per mapping file, but it seems unlikely to be worth the
1054 : * trouble.
1055 : */
1056 372 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
1057 :
1058 : /* Be certain we see any other updates just made */
1059 372 : load_relmap_file(shared, true);
1060 :
1061 : /* Prepare updated data in a local variable */
1062 372 : if (shared)
1063 236 : memcpy(&newmap, &shared_map, sizeof(RelMapFile));
1064 : else
1065 136 : memcpy(&newmap, &local_map, sizeof(RelMapFile));
1066 :
1067 : /*
1068 : * Apply the updates to newmap. No new mappings should appear, unless
1069 : * somebody is adding indexes to system catalogs.
1070 : */
1071 372 : merge_map_updates(&newmap, updates, allowSystemTableMods);
1072 :
1073 : /* Write out the updated map and do other necessary tasks */
1074 372 : write_relmap_file(&newmap, true, true, true,
1075 : (shared ? InvalidOid : MyDatabaseId),
1076 : (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
1077 : (shared ? "global" : DatabasePath));
1078 :
1079 : /*
1080 : * We successfully wrote the updated file, so it's now safe to rely on the
1081 : * new values in this process, too.
1082 : */
1083 372 : if (shared)
1084 236 : memcpy(&shared_map, &newmap, sizeof(RelMapFile));
1085 : else
1086 136 : memcpy(&local_map, &newmap, sizeof(RelMapFile));
1087 :
1088 : /* Now we can release the lock */
1089 372 : LWLockRelease(RelationMappingLock);
1090 372 : }
1091 :
1092 : /*
1093 : * RELMAP resource manager's routines
1094 : */
1095 : void
1096 54 : relmap_redo(XLogReaderState *record)
1097 : {
1098 54 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1099 :
1100 : /* Backup blocks are not used in relmap records */
1101 : Assert(!XLogRecHasAnyBlockRefs(record));
1102 :
1103 54 : if (info == XLOG_RELMAP_UPDATE)
1104 : {
1105 54 : xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record);
1106 : RelMapFile newmap;
1107 : char *dbpath;
1108 :
1109 54 : if (xlrec->nbytes != sizeof(RelMapFile))
1110 0 : elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
1111 : xlrec->nbytes);
1112 54 : memcpy(&newmap, xlrec->data, sizeof(newmap));
1113 :
1114 : /* We need to construct the pathname for this database */
1115 54 : dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
1116 :
1117 : /*
1118 : * Write out the new map and send sinval, but of course don't write a
1119 : * new WAL entry. There's no surrounding transaction to tell to
1120 : * preserve files, either.
1121 : *
1122 : * There shouldn't be anyone else updating relmaps during WAL replay,
1123 : * but grab the lock to interlock against load_relmap_file().
1124 : *
1125 : * Note that we use the same WAL record for updating the relmap of an
1126 : * existing database as we do for creating a new database. In the
1127 : * latter case, taking the relmap log and sending sinval messages is
1128 : * unnecessary, but harmless. If we wanted to avoid it, we could add a
1129 : * flag to the WAL record to indicate which operation is being
1130 : * performed.
1131 : */
1132 54 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
1133 54 : write_relmap_file(&newmap, false, true, false,
1134 : xlrec->dbid, xlrec->tsid, dbpath);
1135 54 : LWLockRelease(RelationMappingLock);
1136 :
1137 54 : pfree(dbpath);
1138 : }
1139 : else
1140 0 : elog(PANIC, "relmap_redo: unknown op code %u", info);
1141 54 : }
|