Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * relmapper.c
4 : * Catalog-to-filenumber mapping
5 : *
6 : * For most tables, the physical file underlying the table is specified by
7 : * pg_class.relfilenode. However, that obviously won't work for pg_class
8 : * itself, nor for the other "nailed" catalogs for which we have to be able
9 : * to set up working Relation entries without access to pg_class. It also
10 : * does not work for shared catalogs, since there is no practical way to
11 : * update other databases' pg_class entries when relocating a shared catalog.
12 : * Therefore, for these special catalogs (henceforth referred to as "mapped
13 : * catalogs") we rely on a separately maintained file that shows the mapping
14 : * from catalog OIDs to filenumbers. Each database has a map file for
15 : * its local mapped catalogs, and there is a separate map file for shared
16 : * catalogs. Mapped catalogs have zero in their pg_class.relfilenode entries.
17 : *
18 : * Relocation of a normal table is committed (ie, the new physical file becomes
19 : * authoritative) when the pg_class row update commits. For mapped catalogs,
20 : * the act of updating the map file is effectively commit of the relocation.
21 : * We postpone the file update till just before commit of the transaction
22 : * doing the rewrite, but there is necessarily a window between. Therefore
23 : * mapped catalogs can only be relocated by operations such as VACUUM FULL
24 : * and CLUSTER, which make no transactionally-significant changes: it must be
25 : * safe for the new file to replace the old, even if the transaction itself
26 : * aborts. An important factor here is that the indexes and toast table of
27 : * a mapped catalog must also be mapped, so that the rewrites/relocations of
28 : * all these files commit in a single map file update rather than being tied
29 : * to transaction commit.
30 : *
31 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
32 : * Portions Copyright (c) 1994, Regents of the University of California
33 : *
34 : *
35 : * IDENTIFICATION
36 : * src/backend/utils/cache/relmapper.c
37 : *
38 : *-------------------------------------------------------------------------
39 : */
40 : #include "postgres.h"
41 :
42 : #include <fcntl.h>
43 : #include <sys/stat.h>
44 : #include <unistd.h>
45 :
46 : #include "access/xact.h"
47 : #include "access/xlog.h"
48 : #include "access/xloginsert.h"
49 : #include "catalog/catalog.h"
50 : #include "catalog/pg_tablespace.h"
51 : #include "catalog/storage.h"
52 : #include "miscadmin.h"
53 : #include "pgstat.h"
54 : #include "storage/fd.h"
55 : #include "storage/lwlock.h"
56 : #include "utils/inval.h"
57 : #include "utils/relmapper.h"
58 :
59 :
60 : /*
61 : * The map file is critical data: we have no automatic method for recovering
62 : * from loss or corruption of it. We use a CRC so that we can detect
63 : * corruption. Since the file might be more than one standard-size disk
64 : * sector in size, we cannot rely on overwrite-in-place. Instead, we generate
65 : * a new file and rename it into place, atomically replacing the original file.
66 : *
67 : * Entries in the mappings[] array are in no particular order. We could
68 : * speed searching by insisting on OID order, but it really shouldn't be
69 : * worth the trouble given the intended size of the mapping sets.
70 : */
71 : #define RELMAPPER_FILENAME "pg_filenode.map"
72 : #define RELMAPPER_TEMP_FILENAME "pg_filenode.map.tmp"
73 :
74 : #define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */
75 :
76 : /*
77 : * There's no need for this constant to have any particular value, and we
78 : * can raise it as necessary if we end up with more mapped relations. For
79 : * now, we just pick a round number that is modestly larger than the expected
80 : * number of mappings.
81 : */
82 : #define MAX_MAPPINGS 64
83 :
84 : typedef struct RelMapping
85 : {
86 : Oid mapoid; /* OID of a catalog */
87 : RelFileNumber mapfilenumber; /* its rel file number */
88 : } RelMapping;
89 :
90 : typedef struct RelMapFile
91 : {
92 : int32 magic; /* always RELMAPPER_FILEMAGIC */
93 : int32 num_mappings; /* number of valid RelMapping entries */
94 : RelMapping mappings[MAX_MAPPINGS];
95 : pg_crc32c crc; /* CRC of all above */
96 : } RelMapFile;
97 :
98 : /*
99 : * State for serializing local and shared relmappings for parallel workers
100 : * (active states only). See notes on active_* and pending_* updates state.
101 : */
102 : typedef struct SerializedActiveRelMaps
103 : {
104 : RelMapFile active_shared_updates;
105 : RelMapFile active_local_updates;
106 : } SerializedActiveRelMaps;
107 :
108 : /*
109 : * The currently known contents of the shared map file and our database's
110 : * local map file are stored here. These can be reloaded from disk
111 : * immediately whenever we receive an update sinval message.
112 : */
113 : static RelMapFile shared_map;
114 : static RelMapFile local_map;
115 :
116 : /*
117 : * We use the same RelMapFile data structure to track uncommitted local
118 : * changes in the mappings (but note the magic and crc fields are not made
119 : * valid in these variables). Currently, map updates are not allowed within
120 : * subtransactions, so one set of transaction-level changes is sufficient.
121 : *
122 : * The active_xxx variables contain updates that are valid in our transaction
123 : * and should be honored by RelationMapOidToFilenumber. The pending_xxx
124 : * variables contain updates we have been told about that aren't active yet;
125 : * they will become active at the next CommandCounterIncrement. This setup
126 : * lets map updates act similarly to updates of pg_class rows, ie, they
127 : * become visible only at the next CommandCounterIncrement boundary.
128 : *
129 : * Active shared and active local updates are serialized by the parallel
130 : * infrastructure, and deserialized within parallel workers.
131 : */
132 : static RelMapFile active_shared_updates;
133 : static RelMapFile active_local_updates;
134 : static RelMapFile pending_shared_updates;
135 : static RelMapFile pending_local_updates;
136 :
137 :
138 : /* non-export function prototypes */
139 : static void apply_map_update(RelMapFile *map, Oid relationId,
140 : RelFileNumber fileNumber, bool add_okay);
141 : static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
142 : bool add_okay);
143 : static void load_relmap_file(bool shared, bool lock_held);
144 : static void read_relmap_file(RelMapFile *map, char *dbpath, bool lock_held,
145 : int elevel);
146 : static void write_relmap_file(RelMapFile *newmap, bool write_wal,
147 : bool send_sinval, bool preserve_files,
148 : Oid dbid, Oid tsid, const char *dbpath);
149 : static void perform_relmap_update(bool shared, const RelMapFile *updates);
150 :
151 :
152 : /*
153 : * RelationMapOidToFilenumber
154 : *
155 : * The raison d'etre ... given a relation OID, look up its filenumber.
156 : *
157 : * Although shared and local relation OIDs should never overlap, the caller
158 : * always knows which we need --- so pass that information to avoid useless
159 : * searching.
160 : *
161 : * Returns InvalidRelFileNumber if the OID is not known (which should never
162 : * happen, but the caller is in a better position to report a meaningful
163 : * error).
164 : */
165 : RelFileNumber
166 1115704 : RelationMapOidToFilenumber(Oid relationId, bool shared)
167 : {
168 : const RelMapFile *map;
169 : int32 i;
170 :
171 : /* If there are active updates, believe those over the main maps */
172 1115704 : if (shared)
173 : {
174 681988 : map = &active_shared_updates;
175 683748 : for (i = 0; i < map->num_mappings; i++)
176 : {
177 2574 : if (relationId == map->mappings[i].mapoid)
178 814 : return map->mappings[i].mapfilenumber;
179 : }
180 681174 : map = &shared_map;
181 17707484 : for (i = 0; i < map->num_mappings; i++)
182 : {
183 17707484 : if (relationId == map->mappings[i].mapoid)
184 681174 : return map->mappings[i].mapfilenumber;
185 : }
186 : }
187 : else
188 : {
189 433716 : map = &active_local_updates;
190 436410 : for (i = 0; i < map->num_mappings; i++)
191 : {
192 4264 : if (relationId == map->mappings[i].mapoid)
193 1570 : return map->mappings[i].mapfilenumber;
194 : }
195 432146 : map = &local_map;
196 3228962 : for (i = 0; i < map->num_mappings; i++)
197 : {
198 3228962 : if (relationId == map->mappings[i].mapoid)
199 432146 : return map->mappings[i].mapfilenumber;
200 : }
201 : }
202 :
203 0 : return InvalidRelFileNumber;
204 : }
205 :
206 : /*
207 : * RelationMapFilenumberToOid
208 : *
209 : * Do the reverse of the normal direction of mapping done in
210 : * RelationMapOidToFilenumber.
211 : *
212 : * This is not supposed to be used during normal running but rather for
213 : * information purposes when looking at the filesystem or xlog.
214 : *
215 : * Returns InvalidOid if the filenumber is not known; this can easily happen
216 : * if the relfilenumber doesn't pertain to a mapped relation.
217 : */
218 : Oid
219 1036 : RelationMapFilenumberToOid(RelFileNumber filenumber, bool shared)
220 : {
221 : const RelMapFile *map;
222 : int32 i;
223 :
224 : /* If there are active updates, believe those over the main maps */
225 1036 : if (shared)
226 : {
227 314 : map = &active_shared_updates;
228 314 : for (i = 0; i < map->num_mappings; i++)
229 : {
230 0 : if (filenumber == map->mappings[i].mapfilenumber)
231 0 : return map->mappings[i].mapoid;
232 : }
233 314 : map = &shared_map;
234 7988 : for (i = 0; i < map->num_mappings; i++)
235 : {
236 7988 : if (filenumber == map->mappings[i].mapfilenumber)
237 314 : return map->mappings[i].mapoid;
238 : }
239 : }
240 : else
241 : {
242 722 : map = &active_local_updates;
243 722 : for (i = 0; i < map->num_mappings; i++)
244 : {
245 0 : if (filenumber == map->mappings[i].mapfilenumber)
246 0 : return map->mappings[i].mapoid;
247 : }
248 722 : map = &local_map;
249 4024 : for (i = 0; i < map->num_mappings; i++)
250 : {
251 3918 : if (filenumber == map->mappings[i].mapfilenumber)
252 616 : return map->mappings[i].mapoid;
253 : }
254 : }
255 :
256 106 : return InvalidOid;
257 : }
258 :
259 : /*
260 : * RelationMapOidToFilenumberForDatabase
261 : *
262 : * Like RelationMapOidToFilenumber, but reads the mapping from the indicated
263 : * path instead of using the one for the current database.
264 : */
265 : RelFileNumber
266 7308 : RelationMapOidToFilenumberForDatabase(char *dbpath, Oid relationId)
267 : {
268 : RelMapFile map;
269 : int i;
270 :
271 : /* Read the relmap file from the source database. */
272 7308 : read_relmap_file(&map, dbpath, false, ERROR);
273 :
274 : /* Iterate over the relmap entries to find the input relation OID. */
275 62524 : for (i = 0; i < map.num_mappings; i++)
276 : {
277 62524 : if (relationId == map.mappings[i].mapoid)
278 7308 : return map.mappings[i].mapfilenumber;
279 : }
280 :
281 0 : return InvalidRelFileNumber;
282 : }
283 :
284 : /*
285 : * RelationMapCopy
286 : *
287 : * Copy relmapfile from source db path to the destination db path and WAL log
288 : * the operation. This is intended for use in creating a new relmap file
289 : * for a database that doesn't have one yet, not for replacing an existing
290 : * relmap file.
291 : */
292 : void
293 406 : RelationMapCopy(Oid dbid, Oid tsid, char *srcdbpath, char *dstdbpath)
294 : {
295 : RelMapFile map;
296 :
297 : /*
298 : * Read the relmap file from the source database.
299 : */
300 406 : read_relmap_file(&map, srcdbpath, false, ERROR);
301 :
302 : /*
303 : * Write the same data into the destination database's relmap file.
304 : *
305 : * No sinval is needed because no one can be connected to the destination
306 : * database yet. For the same reason, there is no need to acquire
307 : * RelationMappingLock.
308 : *
309 : * There's no point in trying to preserve files here. The new database
310 : * isn't usable yet anyway, and won't ever be if we can't install a relmap
311 : * file.
312 : */
313 406 : write_relmap_file(&map, true, false, false, dbid, tsid, dstdbpath);
314 406 : }
315 :
316 : /*
317 : * RelationMapUpdateMap
318 : *
319 : * Install a new relfilenumber mapping for the specified relation.
320 : *
321 : * If immediate is true (or we're bootstrapping), the mapping is activated
322 : * immediately. Otherwise it is made pending until CommandCounterIncrement.
323 : */
324 : void
325 5458 : RelationMapUpdateMap(Oid relationId, RelFileNumber fileNumber, bool shared,
326 : bool immediate)
327 : {
328 : RelMapFile *map;
329 :
330 5458 : if (IsBootstrapProcessingMode())
331 : {
332 : /*
333 : * In bootstrap mode, the mapping gets installed in permanent map.
334 : */
335 4544 : if (shared)
336 3200 : map = &shared_map;
337 : else
338 1344 : map = &local_map;
339 : }
340 : else
341 : {
342 : /*
343 : * We don't currently support map changes within subtransactions, or
344 : * when in parallel mode. This could be done with more bookkeeping
345 : * infrastructure, but it doesn't presently seem worth it.
346 : */
347 914 : if (GetCurrentTransactionNestLevel() > 1)
348 0 : elog(ERROR, "cannot change relation mapping within subtransaction");
349 :
350 914 : if (IsInParallelMode())
351 0 : elog(ERROR, "cannot change relation mapping in parallel mode");
352 :
353 914 : if (immediate)
354 : {
355 : /* Make it active, but only locally */
356 162 : if (shared)
357 0 : map = &active_shared_updates;
358 : else
359 162 : map = &active_local_updates;
360 : }
361 : else
362 : {
363 : /* Make it pending */
364 752 : if (shared)
365 352 : map = &pending_shared_updates;
366 : else
367 400 : map = &pending_local_updates;
368 : }
369 : }
370 5458 : apply_map_update(map, relationId, fileNumber, true);
371 5458 : }
372 :
373 : /*
374 : * apply_map_update
375 : *
376 : * Insert a new mapping into the given map variable, replacing any existing
377 : * mapping for the same relation.
378 : *
379 : * In some cases the caller knows there must be an existing mapping; pass
380 : * add_okay = false to draw an error if not.
381 : */
382 : static void
383 6768 : apply_map_update(RelMapFile *map, Oid relationId, RelFileNumber fileNumber,
384 : bool add_okay)
385 : {
386 : int32 i;
387 :
388 : /* Replace any existing mapping */
389 107858 : for (i = 0; i < map->num_mappings; i++)
390 : {
391 102090 : if (relationId == map->mappings[i].mapoid)
392 : {
393 1000 : map->mappings[i].mapfilenumber = fileNumber;
394 1000 : return;
395 : }
396 : }
397 :
398 : /* Nope, need to add a new mapping */
399 5768 : if (!add_okay)
400 0 : elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
401 : relationId);
402 5768 : if (map->num_mappings >= MAX_MAPPINGS)
403 0 : elog(ERROR, "ran out of space in relation map");
404 5768 : map->mappings[map->num_mappings].mapoid = relationId;
405 5768 : map->mappings[map->num_mappings].mapfilenumber = fileNumber;
406 5768 : map->num_mappings++;
407 : }
408 :
409 : /*
410 : * merge_map_updates
411 : *
412 : * Merge all the updates in the given pending-update map into the target map.
413 : * This is just a bulk form of apply_map_update.
414 : */
415 : static void
416 702 : merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
417 : {
418 : int32 i;
419 :
420 2012 : for (i = 0; i < updates->num_mappings; i++)
421 : {
422 1310 : apply_map_update(map,
423 : updates->mappings[i].mapoid,
424 : updates->mappings[i].mapfilenumber,
425 : add_okay);
426 : }
427 702 : }
428 :
429 : /*
430 : * RelationMapRemoveMapping
431 : *
432 : * Remove a relation's entry in the map. This is only allowed for "active"
433 : * (but not committed) local mappings. We need it so we can back out the
434 : * entry for the transient target file when doing VACUUM FULL/CLUSTER on
435 : * a mapped relation.
436 : */
437 : void
438 162 : RelationMapRemoveMapping(Oid relationId)
439 : {
440 162 : RelMapFile *map = &active_local_updates;
441 : int32 i;
442 :
443 254 : for (i = 0; i < map->num_mappings; i++)
444 : {
445 254 : if (relationId == map->mappings[i].mapoid)
446 : {
447 : /* Found it, collapse it out */
448 162 : map->mappings[i] = map->mappings[map->num_mappings - 1];
449 162 : map->num_mappings--;
450 162 : return;
451 : }
452 : }
453 0 : elog(ERROR, "could not find temporary mapping for relation %u",
454 : relationId);
455 : }
456 :
457 : /*
458 : * RelationMapInvalidate
459 : *
460 : * This routine is invoked for SI cache flush messages. We must re-read
461 : * the indicated map file. However, we might receive a SI message in a
462 : * process that hasn't yet, and might never, load the mapping files;
463 : * for example the autovacuum launcher, which *must not* try to read
464 : * a local map since it is attached to no particular database.
465 : * So, re-read only if the map is valid now.
466 : */
467 : void
468 268 : RelationMapInvalidate(bool shared)
469 : {
470 268 : if (shared)
471 : {
472 170 : if (shared_map.magic == RELMAPPER_FILEMAGIC)
473 170 : load_relmap_file(true, false);
474 : }
475 : else
476 : {
477 98 : if (local_map.magic == RELMAPPER_FILEMAGIC)
478 98 : load_relmap_file(false, false);
479 : }
480 268 : }
481 :
482 : /*
483 : * RelationMapInvalidateAll
484 : *
485 : * Reload all map files. This is used to recover from SI message buffer
486 : * overflow: we can't be sure if we missed an inval message.
487 : * Again, reload only currently-valid maps.
488 : */
489 : void
490 4072 : RelationMapInvalidateAll(void)
491 : {
492 4072 : if (shared_map.magic == RELMAPPER_FILEMAGIC)
493 4072 : load_relmap_file(true, false);
494 4072 : if (local_map.magic == RELMAPPER_FILEMAGIC)
495 4000 : load_relmap_file(false, false);
496 4072 : }
497 :
498 : /*
499 : * AtCCI_RelationMap
500 : *
501 : * Activate any "pending" relation map updates at CommandCounterIncrement time.
502 : */
503 : void
504 917350 : AtCCI_RelationMap(void)
505 : {
506 917350 : if (pending_shared_updates.num_mappings != 0)
507 : {
508 288 : merge_map_updates(&active_shared_updates,
509 : &pending_shared_updates,
510 : true);
511 288 : pending_shared_updates.num_mappings = 0;
512 : }
513 917350 : if (pending_local_updates.num_mappings != 0)
514 : {
515 250 : merge_map_updates(&active_local_updates,
516 : &pending_local_updates,
517 : true);
518 250 : pending_local_updates.num_mappings = 0;
519 : }
520 917350 : }
521 :
522 : /*
523 : * AtEOXact_RelationMap
524 : *
525 : * Handle relation mapping at main-transaction commit or abort.
526 : *
527 : * During commit, this must be called as late as possible before the actual
528 : * transaction commit, so as to minimize the window where the transaction
529 : * could still roll back after committing map changes. Although nothing
530 : * critically bad happens in such a case, we still would prefer that it
531 : * not happen, since we'd possibly be losing useful updates to the relations'
532 : * pg_class row(s).
533 : *
534 : * During abort, we just have to throw away any pending map changes.
535 : * Normal post-abort cleanup will take care of fixing relcache entries.
536 : * Parallel worker commit/abort is handled by resetting active mappings
537 : * that may have been received from the leader process. (There should be
538 : * no pending updates in parallel workers.)
539 : */
540 : void
541 515432 : AtEOXact_RelationMap(bool isCommit, bool isParallelWorker)
542 : {
543 515432 : if (isCommit && !isParallelWorker)
544 : {
545 : /*
546 : * We should not get here with any "pending" updates. (We could
547 : * logically choose to treat such as committed, but in the current
548 : * code this should never happen.)
549 : */
550 : Assert(pending_shared_updates.num_mappings == 0);
551 : Assert(pending_local_updates.num_mappings == 0);
552 :
553 : /*
554 : * Write any active updates to the actual map files, then reset them.
555 : */
556 471456 : if (active_shared_updates.num_mappings != 0)
557 : {
558 104 : perform_relmap_update(true, &active_shared_updates);
559 104 : active_shared_updates.num_mappings = 0;
560 : }
561 471456 : if (active_local_updates.num_mappings != 0)
562 : {
563 60 : perform_relmap_update(false, &active_local_updates);
564 60 : active_local_updates.num_mappings = 0;
565 : }
566 : }
567 : else
568 : {
569 : /* Abort or parallel worker --- drop all active and pending updates */
570 : Assert(!isParallelWorker || pending_shared_updates.num_mappings == 0);
571 : Assert(!isParallelWorker || pending_local_updates.num_mappings == 0);
572 :
573 43976 : active_shared_updates.num_mappings = 0;
574 43976 : active_local_updates.num_mappings = 0;
575 43976 : pending_shared_updates.num_mappings = 0;
576 43976 : pending_local_updates.num_mappings = 0;
577 : }
578 515432 : }
579 :
580 : /*
581 : * AtPrepare_RelationMap
582 : *
583 : * Handle relation mapping at PREPARE.
584 : *
585 : * Currently, we don't support preparing any transaction that changes the map.
586 : */
587 : void
588 756 : AtPrepare_RelationMap(void)
589 : {
590 756 : if (active_shared_updates.num_mappings != 0 ||
591 756 : active_local_updates.num_mappings != 0 ||
592 756 : pending_shared_updates.num_mappings != 0 ||
593 756 : pending_local_updates.num_mappings != 0)
594 0 : ereport(ERROR,
595 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
596 : errmsg("cannot PREPARE a transaction that modified relation mapping")));
597 756 : }
598 :
599 : /*
600 : * CheckPointRelationMap
601 : *
602 : * This is called during a checkpoint. It must ensure that any relation map
603 : * updates that were WAL-logged before the start of the checkpoint are
604 : * securely flushed to disk and will not need to be replayed later. This
605 : * seems unlikely to be a performance-critical issue, so we use a simple
606 : * method: we just take and release the RelationMappingLock. This ensures
607 : * that any already-logged map update is complete, because write_relmap_file
608 : * will fsync the map file before the lock is released.
609 : */
610 : void
611 1512 : CheckPointRelationMap(void)
612 : {
613 1512 : LWLockAcquire(RelationMappingLock, LW_SHARED);
614 1512 : LWLockRelease(RelationMappingLock);
615 1512 : }
616 :
617 : /*
618 : * RelationMapFinishBootstrap
619 : *
620 : * Write out the initial relation mapping files at the completion of
621 : * bootstrap. All the mapped files should have been made known to us
622 : * via RelationMapUpdateMap calls.
623 : */
624 : void
625 64 : RelationMapFinishBootstrap(void)
626 : {
627 : Assert(IsBootstrapProcessingMode());
628 :
629 : /* Shouldn't be anything "pending" ... */
630 : Assert(active_shared_updates.num_mappings == 0);
631 : Assert(active_local_updates.num_mappings == 0);
632 : Assert(pending_shared_updates.num_mappings == 0);
633 : Assert(pending_local_updates.num_mappings == 0);
634 :
635 : /* Write the files; no WAL or sinval needed */
636 64 : write_relmap_file(&shared_map, false, false, false,
637 : InvalidOid, GLOBALTABLESPACE_OID, "global");
638 64 : write_relmap_file(&local_map, false, false, false,
639 : MyDatabaseId, MyDatabaseTableSpace, DatabasePath);
640 64 : }
641 :
642 : /*
643 : * RelationMapInitialize
644 : *
645 : * This initializes the mapper module at process startup. We can't access the
646 : * database yet, so just make sure the maps are empty.
647 : */
648 : void
649 24262 : RelationMapInitialize(void)
650 : {
651 : /* The static variables should initialize to zeroes, but let's be sure */
652 24262 : shared_map.magic = 0; /* mark it not loaded */
653 24262 : local_map.magic = 0;
654 24262 : shared_map.num_mappings = 0;
655 24262 : local_map.num_mappings = 0;
656 24262 : active_shared_updates.num_mappings = 0;
657 24262 : active_local_updates.num_mappings = 0;
658 24262 : pending_shared_updates.num_mappings = 0;
659 24262 : pending_local_updates.num_mappings = 0;
660 24262 : }
661 :
662 : /*
663 : * RelationMapInitializePhase2
664 : *
665 : * This is called to prepare for access to pg_database during startup.
666 : * We should be able to read the shared map file now.
667 : */
668 : void
669 24262 : RelationMapInitializePhase2(void)
670 : {
671 : /*
672 : * In bootstrap mode, the map file isn't there yet, so do nothing.
673 : */
674 24262 : if (IsBootstrapProcessingMode())
675 64 : return;
676 :
677 : /*
678 : * Load the shared map file, die on error.
679 : */
680 24198 : load_relmap_file(true, false);
681 : }
682 :
683 : /*
684 : * RelationMapInitializePhase3
685 : *
686 : * This is called as soon as we have determined MyDatabaseId and set up
687 : * DatabasePath. At this point we should be able to read the local map file.
688 : */
689 : void
690 22154 : RelationMapInitializePhase3(void)
691 : {
692 : /*
693 : * In bootstrap mode, the map file isn't there yet, so do nothing.
694 : */
695 22154 : if (IsBootstrapProcessingMode())
696 64 : return;
697 :
698 : /*
699 : * Load the local map file, die on error.
700 : */
701 22090 : load_relmap_file(false, false);
702 : }
703 :
704 : /*
705 : * EstimateRelationMapSpace
706 : *
707 : * Estimate space needed to pass active shared and local relmaps to parallel
708 : * workers.
709 : */
710 : Size
711 804 : EstimateRelationMapSpace(void)
712 : {
713 804 : return sizeof(SerializedActiveRelMaps);
714 : }
715 :
716 : /*
717 : * SerializeRelationMap
718 : *
719 : * Serialize active shared and local relmap state for parallel workers.
720 : */
721 : void
722 804 : SerializeRelationMap(Size maxSize, char *startAddress)
723 : {
724 : SerializedActiveRelMaps *relmaps;
725 :
726 : Assert(maxSize >= EstimateRelationMapSpace());
727 :
728 804 : relmaps = (SerializedActiveRelMaps *) startAddress;
729 804 : relmaps->active_shared_updates = active_shared_updates;
730 804 : relmaps->active_local_updates = active_local_updates;
731 804 : }
732 :
733 : /*
734 : * RestoreRelationMap
735 : *
736 : * Restore active shared and local relmap state within a parallel worker.
737 : */
738 : void
739 2590 : RestoreRelationMap(char *startAddress)
740 : {
741 : SerializedActiveRelMaps *relmaps;
742 :
743 2590 : if (active_shared_updates.num_mappings != 0 ||
744 2590 : active_local_updates.num_mappings != 0 ||
745 2590 : pending_shared_updates.num_mappings != 0 ||
746 2590 : pending_local_updates.num_mappings != 0)
747 0 : elog(ERROR, "parallel worker has existing mappings");
748 :
749 2590 : relmaps = (SerializedActiveRelMaps *) startAddress;
750 2590 : active_shared_updates = relmaps->active_shared_updates;
751 2590 : active_local_updates = relmaps->active_local_updates;
752 2590 : }
753 :
754 : /*
755 : * load_relmap_file -- load the shared or local map file
756 : *
757 : * Because these files are essential for access to core system catalogs,
758 : * failure to load either of them is a fatal error.
759 : *
760 : * Note that the local case requires DatabasePath to be set up.
761 : */
762 : static void
763 54792 : load_relmap_file(bool shared, bool lock_held)
764 : {
765 54792 : if (shared)
766 28544 : read_relmap_file(&shared_map, "global", lock_held, FATAL);
767 : else
768 26248 : read_relmap_file(&local_map, DatabasePath, lock_held, FATAL);
769 54792 : }
770 :
771 : /*
772 : * read_relmap_file -- load data from any relation mapper file
773 : *
774 : * dbpath must be the relevant database path, or "global" for shared relations.
775 : *
776 : * RelationMappingLock will be acquired and released unless lock_held = true.
777 : *
778 : * Errors will be reported at the indicated elevel, which should be at least
779 : * ERROR.
780 : */
781 : static void
782 62506 : read_relmap_file(RelMapFile *map, char *dbpath, bool lock_held, int elevel)
783 : {
784 : char mapfilename[MAXPGPATH];
785 : pg_crc32c crc;
786 : int fd;
787 : int r;
788 :
789 : Assert(elevel >= ERROR);
790 :
791 : /*
792 : * Grab the lock to prevent the file from being updated while we read it,
793 : * unless the caller is already holding the lock. If the file is updated
794 : * shortly after we look, the sinval signaling mechanism will make us
795 : * re-read it before we are able to access any relation that's affected by
796 : * the change.
797 : */
798 62506 : if (!lock_held)
799 62342 : LWLockAcquire(RelationMappingLock, LW_SHARED);
800 :
801 : /*
802 : * Open the target file.
803 : *
804 : * Because Windows isn't happy about the idea of renaming over a file that
805 : * someone has open, we only open this file after acquiring the lock, and
806 : * for the same reason, we close it before releasing the lock. That way,
807 : * by the time write_relmap_file() acquires an exclusive lock, no one else
808 : * will have it open.
809 : */
810 62506 : snprintf(mapfilename, sizeof(mapfilename), "%s/%s", dbpath,
811 : RELMAPPER_FILENAME);
812 62506 : fd = OpenTransientFile(mapfilename, O_RDONLY | PG_BINARY);
813 62506 : if (fd < 0)
814 0 : ereport(elevel,
815 : (errcode_for_file_access(),
816 : errmsg("could not open file \"%s\": %m",
817 : mapfilename)));
818 :
819 : /* Now read the data. */
820 62506 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_READ);
821 62506 : r = read(fd, map, sizeof(RelMapFile));
822 62506 : if (r != sizeof(RelMapFile))
823 : {
824 0 : if (r < 0)
825 0 : ereport(elevel,
826 : (errcode_for_file_access(),
827 : errmsg("could not read file \"%s\": %m", mapfilename)));
828 : else
829 0 : ereport(elevel,
830 : (errcode(ERRCODE_DATA_CORRUPTED),
831 : errmsg("could not read file \"%s\": read %d of %zu",
832 : mapfilename, r, sizeof(RelMapFile))));
833 : }
834 62506 : pgstat_report_wait_end();
835 :
836 62506 : if (CloseTransientFile(fd) != 0)
837 0 : ereport(elevel,
838 : (errcode_for_file_access(),
839 : errmsg("could not close file \"%s\": %m",
840 : mapfilename)));
841 :
842 62506 : if (!lock_held)
843 62342 : LWLockRelease(RelationMappingLock);
844 :
845 : /* check for correct magic number, etc */
846 62506 : if (map->magic != RELMAPPER_FILEMAGIC ||
847 62506 : map->num_mappings < 0 ||
848 62506 : map->num_mappings > MAX_MAPPINGS)
849 0 : ereport(elevel,
850 : (errmsg("relation mapping file \"%s\" contains invalid data",
851 : mapfilename)));
852 :
853 : /* verify the CRC */
854 62506 : INIT_CRC32C(crc);
855 62506 : COMP_CRC32C(crc, (char *) map, offsetof(RelMapFile, crc));
856 62506 : FIN_CRC32C(crc);
857 :
858 62506 : if (!EQ_CRC32C(crc, map->crc))
859 0 : ereport(elevel,
860 : (errmsg("relation mapping file \"%s\" contains incorrect checksum",
861 : mapfilename)));
862 62506 : }
863 :
864 : /*
865 : * Write out a new shared or local map file with the given contents.
866 : *
867 : * The magic number and CRC are automatically updated in *newmap. On
868 : * success, we copy the data to the appropriate permanent static variable.
869 : *
870 : * If write_wal is true then an appropriate WAL message is emitted.
871 : * (It will be false for bootstrap and WAL replay cases.)
872 : *
873 : * If send_sinval is true then a SI invalidation message is sent.
874 : * (This should be true except in the bootstrap and RelationMapCopy cases.)
875 : *
876 : * If preserve_files is true then the storage manager is warned not to
877 : * delete the files listed in the map.
878 : *
879 : * Because this may be called during WAL replay when MyDatabaseId,
880 : * DatabasePath, etc aren't valid, we require the caller to pass in suitable
881 : * values. Pass dbpath as "global" for the shared map.
882 : *
883 : * The caller is also responsible for being sure no concurrent map update
884 : * could be happening.
885 : */
886 : static void
887 742 : write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
888 : bool preserve_files, Oid dbid, Oid tsid, const char *dbpath)
889 : {
890 : int fd;
891 : char mapfilename[MAXPGPATH];
892 : char maptempfilename[MAXPGPATH];
893 :
894 : /*
895 : * Fill in the overhead fields and update CRC.
896 : */
897 742 : newmap->magic = RELMAPPER_FILEMAGIC;
898 742 : if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
899 0 : elog(ERROR, "attempt to write bogus relation mapping");
900 :
901 742 : INIT_CRC32C(newmap->crc);
902 742 : COMP_CRC32C(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
903 742 : FIN_CRC32C(newmap->crc);
904 :
905 : /*
906 : * Construct filenames -- a temporary file that we'll create to write the
907 : * data initially, and then the permanent name to which we will rename it.
908 : */
909 742 : snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
910 : dbpath, RELMAPPER_FILENAME);
911 742 : snprintf(maptempfilename, sizeof(maptempfilename), "%s/%s",
912 : dbpath, RELMAPPER_TEMP_FILENAME);
913 :
914 : /*
915 : * Open a temporary file. If a file already exists with this name, it must
916 : * be left over from a previous crash, so we can overwrite it. Concurrent
917 : * calls to this function are not allowed.
918 : */
919 742 : fd = OpenTransientFile(maptempfilename,
920 : O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY);
921 742 : if (fd < 0)
922 0 : ereport(ERROR,
923 : (errcode_for_file_access(),
924 : errmsg("could not open file \"%s\": %m",
925 : maptempfilename)));
926 :
927 : /* Write new data to the file. */
928 742 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
929 742 : if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
930 : {
931 : /* if write didn't set errno, assume problem is no disk space */
932 0 : if (errno == 0)
933 0 : errno = ENOSPC;
934 0 : ereport(ERROR,
935 : (errcode_for_file_access(),
936 : errmsg("could not write file \"%s\": %m",
937 : maptempfilename)));
938 : }
939 742 : pgstat_report_wait_end();
940 :
941 : /* And close the file. */
942 742 : if (CloseTransientFile(fd) != 0)
943 0 : ereport(ERROR,
944 : (errcode_for_file_access(),
945 : errmsg("could not close file \"%s\": %m",
946 : maptempfilename)));
947 :
948 742 : if (write_wal)
949 : {
950 : xl_relmap_update xlrec;
951 : XLogRecPtr lsn;
952 :
953 : /* now errors are fatal ... */
954 570 : START_CRIT_SECTION();
955 :
956 570 : xlrec.dbid = dbid;
957 570 : xlrec.tsid = tsid;
958 570 : xlrec.nbytes = sizeof(RelMapFile);
959 :
960 570 : XLogBeginInsert();
961 570 : XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate);
962 570 : XLogRegisterData((char *) newmap, sizeof(RelMapFile));
963 :
964 570 : lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE);
965 :
966 : /* As always, WAL must hit the disk before the data update does */
967 570 : XLogFlush(lsn);
968 : }
969 :
970 : /*
971 : * durable_rename() does all the hard work of making sure that we rename
972 : * the temporary file into place in a crash-safe manner.
973 : *
974 : * NB: Although we instruct durable_rename() to use ERROR, we will often
975 : * be in a critical section at this point; if so, ERROR will become PANIC.
976 : */
977 742 : pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_REPLACE);
978 742 : durable_rename(maptempfilename, mapfilename, ERROR);
979 742 : pgstat_report_wait_end();
980 :
981 : /*
982 : * Now that the file is safely on disk, send sinval message to let other
983 : * backends know to re-read it. We must do this inside the critical
984 : * section: if for some reason we fail to send the message, we have to
985 : * force a database-wide PANIC. Otherwise other backends might continue
986 : * execution with stale mapping information, which would be catastrophic
987 : * as soon as others began to use the now-committed data.
988 : */
989 742 : if (send_sinval)
990 208 : CacheInvalidateRelmap(dbid);
991 :
992 : /*
993 : * Make sure that the files listed in the map are not deleted if the outer
994 : * transaction aborts. This had better be within the critical section
995 : * too: it's not likely to fail, but if it did, we'd arrive at transaction
996 : * abort with the files still vulnerable. PANICing will leave things in a
997 : * good state on-disk.
998 : *
999 : * Note: we're cheating a little bit here by assuming that mapped files
1000 : * are either in pg_global or the database's default tablespace.
1001 : */
1002 742 : if (preserve_files)
1003 : {
1004 : int32 i;
1005 :
1006 6384 : for (i = 0; i < newmap->num_mappings; i++)
1007 : {
1008 : RelFileLocator rlocator;
1009 :
1010 6220 : rlocator.spcOid = tsid;
1011 6220 : rlocator.dbOid = dbid;
1012 6220 : rlocator.relNumber = newmap->mappings[i].mapfilenumber;
1013 6220 : RelationPreserveStorage(rlocator, false);
1014 : }
1015 : }
1016 :
1017 : /* Critical section done */
1018 742 : if (write_wal)
1019 570 : END_CRIT_SECTION();
1020 742 : }
1021 :
1022 : /*
1023 : * Merge the specified updates into the appropriate "real" map,
1024 : * and write out the changes. This function must be used for committing
1025 : * updates during normal multiuser operation.
1026 : */
1027 : static void
1028 164 : perform_relmap_update(bool shared, const RelMapFile *updates)
1029 : {
1030 : RelMapFile newmap;
1031 :
1032 : /*
1033 : * Anyone updating a relation's mapping info should take exclusive lock on
1034 : * that rel and hold it until commit. This ensures that there will not be
1035 : * concurrent updates on the same mapping value; but there could easily be
1036 : * concurrent updates on different values in the same file. We cover that
1037 : * by acquiring the RelationMappingLock, re-reading the target file to
1038 : * ensure it's up to date, applying the updates, and writing the data
1039 : * before releasing RelationMappingLock.
1040 : *
1041 : * There is only one RelationMappingLock. In principle we could try to
1042 : * have one per mapping file, but it seems unlikely to be worth the
1043 : * trouble.
1044 : */
1045 164 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
1046 :
1047 : /* Be certain we see any other updates just made */
1048 164 : load_relmap_file(shared, true);
1049 :
1050 : /* Prepare updated data in a local variable */
1051 164 : if (shared)
1052 104 : memcpy(&newmap, &shared_map, sizeof(RelMapFile));
1053 : else
1054 60 : memcpy(&newmap, &local_map, sizeof(RelMapFile));
1055 :
1056 : /*
1057 : * Apply the updates to newmap. No new mappings should appear, unless
1058 : * somebody is adding indexes to system catalogs.
1059 : */
1060 164 : merge_map_updates(&newmap, updates, allowSystemTableMods);
1061 :
1062 : /* Write out the updated map and do other necessary tasks */
1063 164 : write_relmap_file(&newmap, true, true, true,
1064 : (shared ? InvalidOid : MyDatabaseId),
1065 : (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
1066 : (shared ? "global" : DatabasePath));
1067 :
1068 : /*
1069 : * We successfully wrote the updated file, so it's now safe to rely on the
1070 : * new values in this process, too.
1071 : */
1072 164 : if (shared)
1073 104 : memcpy(&shared_map, &newmap, sizeof(RelMapFile));
1074 : else
1075 60 : memcpy(&local_map, &newmap, sizeof(RelMapFile));
1076 :
1077 : /* Now we can release the lock */
1078 164 : LWLockRelease(RelationMappingLock);
1079 164 : }
1080 :
1081 : /*
1082 : * RELMAP resource manager's routines
1083 : */
1084 : void
1085 44 : relmap_redo(XLogReaderState *record)
1086 : {
1087 44 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1088 :
1089 : /* Backup blocks are not used in relmap records */
1090 : Assert(!XLogRecHasAnyBlockRefs(record));
1091 :
1092 44 : if (info == XLOG_RELMAP_UPDATE)
1093 : {
1094 44 : xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record);
1095 : RelMapFile newmap;
1096 : char *dbpath;
1097 :
1098 44 : if (xlrec->nbytes != sizeof(RelMapFile))
1099 0 : elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
1100 : xlrec->nbytes);
1101 44 : memcpy(&newmap, xlrec->data, sizeof(newmap));
1102 :
1103 : /* We need to construct the pathname for this database */
1104 44 : dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
1105 :
1106 : /*
1107 : * Write out the new map and send sinval, but of course don't write a
1108 : * new WAL entry. There's no surrounding transaction to tell to
1109 : * preserve files, either.
1110 : *
1111 : * There shouldn't be anyone else updating relmaps during WAL replay,
1112 : * but grab the lock to interlock against load_relmap_file().
1113 : *
1114 : * Note that we use the same WAL record for updating the relmap of an
1115 : * existing database as we do for creating a new database. In the
1116 : * latter case, taking the relmap log and sending sinval messages is
1117 : * unnecessary, but harmless. If we wanted to avoid it, we could add a
1118 : * flag to the WAL record to indicate which operation is being
1119 : * performed.
1120 : */
1121 44 : LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
1122 44 : write_relmap_file(&newmap, false, true, false,
1123 : xlrec->dbid, xlrec->tsid, dbpath);
1124 44 : LWLockRelease(RelationMappingLock);
1125 :
1126 44 : pfree(dbpath);
1127 : }
1128 : else
1129 0 : elog(PANIC, "relmap_redo: unknown op code %u", info);
1130 44 : }
|