Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * syncscan.c
4 : * scan synchronization support
5 : *
6 : * When multiple backends run a sequential scan on the same table, we try
7 : * to keep them synchronized to reduce the overall I/O needed. The goal is
8 : * to read each page into shared buffer cache only once, and let all backends
9 : * that take part in the shared scan process the page before it falls out of
10 : * the cache.
11 : *
12 : * Since the "leader" in a pack of backends doing a seqscan will have to wait
13 : * for I/O, while the "followers" don't, there is a strong self-synchronizing
14 : * effect once we can get the backends examining approximately the same part
15 : * of the table at the same time. Hence all that is really needed is to get
16 : * a new backend beginning a seqscan to begin it close to where other backends
17 : * are reading. We can scan the table circularly, from block X up to the
18 : * end and then from block 0 to X-1, to ensure we visit all rows while still
19 : * participating in the common scan.
20 : *
21 : * To accomplish that, we keep track of the scan position of each table, and
22 : * start new scans close to where the previous scan(s) are. We don't try to
23 : * do any extra synchronization to keep the scans together afterwards; some
24 : * scans might progress much more slowly than others, for example if the
25 : * results need to be transferred to the client over a slow network, and we
26 : * don't want such queries to slow down others.
27 : *
28 : * There can realistically only be a few large sequential scans on different
29 : * tables in progress at any time. Therefore we just keep the scan positions
30 : * in a small LRU list which we scan every time we need to look up or update a
31 : * scan position. The whole mechanism is only applied for tables exceeding
32 : * a threshold size (but that is not the concern of this module).
33 : *
34 : * INTERFACE ROUTINES
35 : * ss_get_location - return current scan location of a relation
36 : * ss_report_location - update current scan location
37 : *
38 : *
39 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
40 : * Portions Copyright (c) 1994, Regents of the University of California
41 : *
42 : * IDENTIFICATION
43 : * src/backend/access/common/syncscan.c
44 : *
45 : *-------------------------------------------------------------------------
46 : */
47 : #include "postgres.h"
48 :
49 : #include "access/syncscan.h"
50 : #include "miscadmin.h"
51 : #include "storage/lwlock.h"
52 : #include "storage/shmem.h"
53 : #include "utils/rel.h"
54 :
55 :
56 : /* GUC variables */
57 : #ifdef TRACE_SYNCSCAN
58 : bool trace_syncscan = false;
59 : #endif
60 :
61 :
62 : /*
63 : * Size of the LRU list.
64 : *
65 : * Note: the code assumes that SYNC_SCAN_NELEM > 1.
66 : *
67 : * XXX: What's a good value? It should be large enough to hold the
68 : * maximum number of large tables scanned simultaneously. But a larger value
69 : * means more traversing of the LRU list when starting a new scan.
70 : */
71 : #define SYNC_SCAN_NELEM 20
72 :
73 : /*
74 : * Interval between reports of the location of the current scan, in pages.
75 : *
76 : * Note: This should be smaller than the ring size (see buffer/freelist.c)
77 : * we use for bulk reads. Otherwise a scan joining other scans might start
78 : * from a page that's no longer in the buffer cache. This is a bit fuzzy;
79 : * there's no guarantee that the new scan will read the page before it leaves
80 : * the buffer cache anyway, and on the other hand the page is most likely
81 : * still in the OS cache.
82 : */
83 : #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
84 :
85 :
86 : /*
87 : * The scan locations structure is essentially a doubly-linked LRU with head
88 : * and tail pointer, but designed to hold a fixed maximum number of elements in
89 : * fixed-size shared memory.
90 : */
91 : typedef struct ss_scan_location_t
92 : {
93 : RelFileLocator relfilelocator; /* identity of a relation */
94 : BlockNumber location; /* last-reported location in the relation */
95 : } ss_scan_location_t;
96 :
97 : typedef struct ss_lru_item_t
98 : {
99 : struct ss_lru_item_t *prev;
100 : struct ss_lru_item_t *next;
101 : ss_scan_location_t location;
102 : } ss_lru_item_t;
103 :
104 : typedef struct ss_scan_locations_t
105 : {
106 : ss_lru_item_t *head;
107 : ss_lru_item_t *tail;
108 : ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
109 : } ss_scan_locations_t;
110 :
111 : #define SizeOfScanLocations(N) \
112 : (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
113 :
114 : /* Pointer to struct in shared memory */
115 : static ss_scan_locations_t *scan_locations;
116 :
117 : /* prototypes for internal functions */
118 : static BlockNumber ss_search(RelFileLocator relfilelocator,
119 : BlockNumber location, bool set);
120 :
121 :
122 : /*
123 : * SyncScanShmemSize --- report amount of shared memory space needed
124 : */
125 : Size
126 3566 : SyncScanShmemSize(void)
127 : {
128 3566 : return SizeOfScanLocations(SYNC_SCAN_NELEM);
129 : }
130 :
131 : /*
132 : * SyncScanShmemInit --- initialize this module's shared memory
133 : */
134 : void
135 1918 : SyncScanShmemInit(void)
136 : {
137 : int i;
138 : bool found;
139 :
140 1918 : scan_locations = (ss_scan_locations_t *)
141 1918 : ShmemInitStruct("Sync Scan Locations List",
142 : SizeOfScanLocations(SYNC_SCAN_NELEM),
143 : &found);
144 :
145 1918 : if (!IsUnderPostmaster)
146 : {
147 : /* Initialize shared memory area */
148 : Assert(!found);
149 :
150 1918 : scan_locations->head = &scan_locations->items[0];
151 1918 : scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
152 :
153 40278 : for (i = 0; i < SYNC_SCAN_NELEM; i++)
154 : {
155 38360 : ss_lru_item_t *item = &scan_locations->items[i];
156 :
157 : /*
158 : * Initialize all slots with invalid values. As scans are started,
159 : * these invalid entries will fall off the LRU list and get
160 : * replaced with real entries.
161 : */
162 38360 : item->location.relfilelocator.spcOid = InvalidOid;
163 38360 : item->location.relfilelocator.dbOid = InvalidOid;
164 38360 : item->location.relfilelocator.relNumber = InvalidRelFileNumber;
165 38360 : item->location.location = InvalidBlockNumber;
166 :
167 38360 : item->prev = (i > 0) ?
168 38360 : (&scan_locations->items[i - 1]) : NULL;
169 38360 : item->next = (i < SYNC_SCAN_NELEM - 1) ?
170 38360 : (&scan_locations->items[i + 1]) : NULL;
171 : }
172 : }
173 : else
174 : Assert(found);
175 1918 : }
176 :
177 : /*
178 : * ss_search --- search the scan_locations structure for an entry with the
179 : * given relfilelocator.
180 : *
181 : * If "set" is true, the location is updated to the given location. If no
182 : * entry for the given relfilelocator is found, it will be created at the head
183 : * of the list with the given location, even if "set" is false.
184 : *
185 : * In any case, the location after possible update is returned.
186 : *
187 : * Caller is responsible for having acquired suitable lock on the shared
188 : * data structure.
189 : */
190 : static BlockNumber
191 2404 : ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)
192 : {
193 : ss_lru_item_t *item;
194 :
195 2404 : item = scan_locations->head;
196 : for (;;)
197 912 : {
198 : bool match;
199 :
200 3316 : match = RelFileLocatorEquals(item->location.relfilelocator,
201 : relfilelocator);
202 :
203 3316 : if (match || item->next == NULL)
204 : {
205 : /*
206 : * If we reached the end of list and no match was found, take over
207 : * the last entry
208 : */
209 2404 : if (!match)
210 : {
211 48 : item->location.relfilelocator = relfilelocator;
212 48 : item->location.location = location;
213 : }
214 2356 : else if (set)
215 2284 : item->location.location = location;
216 :
217 : /* Move the entry to the front of the LRU list */
218 2404 : if (item != scan_locations->head)
219 : {
220 : /* unlink */
221 48 : if (item == scan_locations->tail)
222 48 : scan_locations->tail = item->prev;
223 48 : item->prev->next = item->next;
224 48 : if (item->next)
225 0 : item->next->prev = item->prev;
226 :
227 : /* link */
228 48 : item->prev = NULL;
229 48 : item->next = scan_locations->head;
230 48 : scan_locations->head->prev = item;
231 48 : scan_locations->head = item;
232 : }
233 :
234 2404 : return item->location.location;
235 : }
236 :
237 912 : item = item->next;
238 : }
239 :
240 : /* not reached */
241 : }
242 :
243 : /*
244 : * ss_get_location --- get the optimal starting location for scan
245 : *
246 : * Returns the last-reported location of a sequential scan on the
247 : * relation, or 0 if no valid location is found.
248 : *
249 : * We expect the caller has just done RelationGetNumberOfBlocks(), and
250 : * so that number is passed in rather than computing it again. The result
251 : * is guaranteed less than relnblocks (assuming that's > 0).
252 : */
253 : BlockNumber
254 120 : ss_get_location(Relation rel, BlockNumber relnblocks)
255 : {
256 : BlockNumber startloc;
257 :
258 120 : LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
259 120 : startloc = ss_search(rel->rd_locator, 0, false);
260 120 : LWLockRelease(SyncScanLock);
261 :
262 : /*
263 : * If the location is not a valid block number for this scan, start at 0.
264 : *
265 : * This can happen if for instance a VACUUM truncated the table since the
266 : * location was saved.
267 : */
268 120 : if (startloc >= relnblocks)
269 0 : startloc = 0;
270 :
271 : #ifdef TRACE_SYNCSCAN
272 : if (trace_syncscan)
273 : elog(LOG,
274 : "SYNC_SCAN: start \"%s\" (size %u) at %u",
275 : RelationGetRelationName(rel), relnblocks, startloc);
276 : #endif
277 :
278 120 : return startloc;
279 : }
280 :
281 : /*
282 : * ss_report_location --- update the current scan location
283 : *
284 : * Writes an entry into the shared Sync Scan state of the form
285 : * (relfilelocator, blocknumber), overwriting any existing entry for the
286 : * same relfilelocator.
287 : */
288 : void
289 35276 : ss_report_location(Relation rel, BlockNumber location)
290 : {
291 : #ifdef TRACE_SYNCSCAN
292 : if (trace_syncscan)
293 : {
294 : if ((location % 1024) == 0)
295 : elog(LOG,
296 : "SYNC_SCAN: scanning \"%s\" at %u",
297 : RelationGetRelationName(rel), location);
298 : }
299 : #endif
300 :
301 : /*
302 : * To reduce lock contention, only report scan progress every N pages. For
303 : * the same reason, don't block if the lock isn't immediately available.
304 : * Missing a few updates isn't critical, it just means that a new scan
305 : * that wants to join the pack will start a little bit behind the head of
306 : * the scan. Hopefully the pages are still in OS cache and the scan
307 : * catches up quickly.
308 : */
309 35276 : if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
310 : {
311 2284 : if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
312 : {
313 2284 : (void) ss_search(rel->rd_locator, location, true);
314 2284 : LWLockRelease(SyncScanLock);
315 : }
316 : #ifdef TRACE_SYNCSCAN
317 : else if (trace_syncscan)
318 : elog(LOG,
319 : "SYNC_SCAN: missed update for \"%s\" at %u",
320 : RelationGetRelationName(rel), location);
321 : #endif
322 : }
323 35276 : }
|