Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ginpostinglist.c
4 : * routines for dealing with posting lists.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/gin/ginpostinglist.c
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/gin_private.h"
18 :
19 : #ifdef USE_ASSERT_CHECKING
20 : #define CHECK_ENCODING_ROUNDTRIP
21 : #endif
22 :
23 : /*
24 : * For encoding purposes, item pointers are represented as 64-bit unsigned
25 : * integers. The lowest 11 bits represent the offset number, and the next
26 : * lowest 32 bits are the block number. That leaves 21 bits unused, i.e.
27 : * only 43 low bits are used.
28 : *
29 : * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage <
30 : * 2^11 on all supported block sizes. We are frugal with the bits, because
31 : * smaller integers use fewer bytes in the varbyte encoding, saving disk
32 : * space. (If we get a new table AM in the future that wants to use the full
33 : * range of possible offset numbers, we'll need to change this.)
34 : *
35 : * These 43-bit integers are encoded using varbyte encoding. In each byte,
36 : * the 7 low bits contain data, while the highest bit is a continuation bit.
37 : * When the continuation bit is set, the next byte is part of the same
38 : * integer, otherwise this is the last byte of this integer. 43 bits need
39 : * at most 7 bytes in this encoding:
40 : *
41 : * 0XXXXXXX
42 : * 1XXXXXXX 0XXXXYYY
43 : * 1XXXXXXX 1XXXXYYY 0YYYYYYY
44 : * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
45 : * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
46 : * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
47 : * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY
48 : *
49 : * X = bits used for offset number
50 : * Y = bits used for block number
51 : * u = unused bit
52 : *
53 : * The bytes are in stored in little-endian order.
54 : *
55 : * An important property of this encoding is that removing an item from list
56 : * never increases the size of the resulting compressed posting list. Proof:
57 : *
58 : * Removing number is actually replacement of two numbers with their sum. We
59 : * have to prove that varbyte encoding of a sum can't be longer than varbyte
60 : * encoding of its summands. Sum of two numbers is at most one bit wider than
61 : * the larger of the summands. Widening a number by one bit enlarges its length
62 : * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum
63 : * is at most one byte longer than varbyte encoding of larger summand. Lesser
64 : * summand is at least one byte, so the sum cannot take more space than the
65 : * summands, Q.E.D.
66 : *
67 : * This property greatly simplifies VACUUM, which can assume that posting
68 : * lists always fit on the same page after vacuuming. Note that even though
69 : * that holds for removing items from a posting list, you must also be
70 : * careful to not cause expansion e.g. when merging uncompressed items on the
71 : * page into the compressed lists, when vacuuming.
72 : */
73 :
74 : /*
75 : * How many bits do you need to encode offset number? OffsetNumber is a 16-bit
76 : * integer, but you can't fit that many items on a page. 11 ought to be more
77 : * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
78 : * use the minimum number of bits, but that would require changing the on-disk
79 : * format if MaxHeapTuplesPerPage changes. Better to leave some slack.
80 : */
81 : #define MaxHeapTuplesPerPageBits 11
82 :
83 : /* Max. number of bytes needed to encode the largest supported integer. */
84 : #define MaxBytesPerInteger 7
85 :
86 : static inline uint64
87 34199282 : itemptr_to_uint64(const ItemPointer iptr)
88 : {
89 : uint64 val;
90 :
91 : Assert(ItemPointerIsValid(iptr));
92 : Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits));
93 :
94 34199282 : val = GinItemPointerGetBlockNumber(iptr);
95 34199282 : val <<= MaxHeapTuplesPerPageBits;
96 34199282 : val |= GinItemPointerGetOffsetNumber(iptr);
97 :
98 34199282 : return val;
99 : }
100 :
101 : static inline void
102 34385800 : uint64_to_itemptr(uint64 val, ItemPointer iptr)
103 : {
104 34385800 : GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1));
105 34385800 : val = val >> MaxHeapTuplesPerPageBits;
106 34385800 : GinItemPointerSetBlockNumber(iptr, val);
107 :
108 : Assert(ItemPointerIsValid(iptr));
109 34385800 : }
110 :
111 : /*
112 : * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
113 : */
114 : static void
115 32792066 : encode_varbyte(uint64 val, unsigned char **ptr)
116 : {
117 32792066 : unsigned char *p = *ptr;
118 :
119 33387052 : while (val > 0x7F)
120 : {
121 594986 : *(p++) = 0x80 | (val & 0x7F);
122 594986 : val >>= 7;
123 : }
124 32792066 : *(p++) = (unsigned char) val;
125 :
126 32792066 : *ptr = p;
127 32792066 : }
128 :
129 : /*
130 : * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer.
131 : */
132 : static uint64
133 34385800 : decode_varbyte(unsigned char **ptr)
134 : {
135 : uint64 val;
136 34385800 : unsigned char *p = *ptr;
137 : uint64 c;
138 :
139 : /* 1st byte */
140 34385800 : c = *(p++);
141 34385800 : val = c & 0x7F;
142 34385800 : if (c & 0x80)
143 : {
144 : /* 2nd byte */
145 1245448 : c = *(p++);
146 1245448 : val |= (c & 0x7F) << 7;
147 1245448 : if (c & 0x80)
148 : {
149 : /* 3rd byte */
150 87332 : c = *(p++);
151 87332 : val |= (c & 0x7F) << 14;
152 87332 : if (c & 0x80)
153 : {
154 : /* 4th byte */
155 2 : c = *(p++);
156 2 : val |= (c & 0x7F) << 21;
157 2 : if (c & 0x80)
158 : {
159 : /* 5th byte */
160 2 : c = *(p++);
161 2 : val |= (c & 0x7F) << 28;
162 2 : if (c & 0x80)
163 : {
164 : /* 6th byte */
165 2 : c = *(p++);
166 2 : val |= (c & 0x7F) << 35;
167 2 : if (c & 0x80)
168 : {
169 : /* 7th byte, should not have continuation bit */
170 2 : c = *(p++);
171 2 : val |= c << 42;
172 : Assert((c & 0x80) == 0);
173 : }
174 : }
175 : }
176 : }
177 : }
178 : }
179 :
180 34385800 : *ptr = p;
181 :
182 34385800 : return val;
183 : }
184 :
185 : /*
186 : * Encode a posting list.
187 : *
188 : * The encoded list is returned in a palloc'd struct, which will be at most
189 : * 'maxsize' bytes in size. The number items in the returned segment is
190 : * returned in *nwritten. If it's not equal to nipd, not all the items fit
191 : * in 'maxsize', and only the first *nwritten were encoded.
192 : *
193 : * The allocated size of the returned struct is short-aligned, and the padding
194 : * byte at the end, if any, is zero.
195 : */
196 : GinPostingList *
197 782474 : ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
198 : int *nwritten)
199 : {
200 : uint64 prev;
201 782474 : int totalpacked = 0;
202 : int maxbytes;
203 : GinPostingList *result;
204 : unsigned char *ptr;
205 : unsigned char *endptr;
206 :
207 782474 : maxsize = SHORTALIGN_DOWN(maxsize);
208 :
209 782474 : result = palloc(maxsize);
210 :
211 782474 : maxbytes = maxsize - offsetof(GinPostingList, bytes);
212 : Assert(maxbytes > 0);
213 :
214 : /* Store the first special item */
215 782474 : result->first = ipd[0];
216 :
217 782474 : prev = itemptr_to_uint64(&result->first);
218 :
219 782474 : ptr = result->bytes;
220 782474 : endptr = result->bytes + maxbytes;
221 33570330 : for (totalpacked = 1; totalpacked < nipd; totalpacked++)
222 : {
223 32792066 : uint64 val = itemptr_to_uint64(&ipd[totalpacked]);
224 32792066 : uint64 delta = val - prev;
225 :
226 : Assert(val > prev);
227 :
228 32792066 : if (endptr - ptr >= MaxBytesPerInteger)
229 32763160 : encode_varbyte(delta, &ptr);
230 : else
231 : {
232 : /*
233 : * There are less than 7 bytes left. Have to check if the next
234 : * item fits in that space before writing it out.
235 : */
236 : unsigned char buf[MaxBytesPerInteger];
237 28906 : unsigned char *p = buf;
238 :
239 28906 : encode_varbyte(delta, &p);
240 28906 : if (p - buf > (endptr - ptr))
241 4210 : break; /* output is full */
242 :
243 24696 : memcpy(ptr, buf, p - buf);
244 24696 : ptr += (p - buf);
245 : }
246 32787856 : prev = val;
247 : }
248 782474 : result->nbytes = ptr - result->bytes;
249 :
250 : /*
251 : * If we wrote an odd number of bytes, zero out the padding byte at the
252 : * end.
253 : */
254 782474 : if (result->nbytes != SHORTALIGN(result->nbytes))
255 60752 : result->bytes[result->nbytes] = 0;
256 :
257 782474 : if (nwritten)
258 60348 : *nwritten = totalpacked;
259 :
260 : Assert(SizeOfGinPostingList(result) <= maxsize);
261 :
262 : /*
263 : * Check that the encoded segment decodes back to the original items.
264 : */
265 : #if defined (CHECK_ENCODING_ROUNDTRIP)
266 : {
267 : int ndecoded;
268 : ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
269 :
270 : Assert(ndecoded == totalpacked);
271 : Assert(memcmp(tmp, ipd, ndecoded * sizeof(ItemPointerData)) == 0);
272 : pfree(tmp);
273 : }
274 : #endif
275 :
276 782474 : return result;
277 : }
278 :
279 : /*
280 : * Decode a compressed posting list into an array of item pointers.
281 : * The number of items is returned in *ndecoded.
282 : */
283 : ItemPointer
284 621390 : ginPostingListDecode(GinPostingList *plist, int *ndecoded_out)
285 : {
286 1242780 : return ginPostingListDecodeAllSegments(plist,
287 621390 : SizeOfGinPostingList(plist),
288 : ndecoded_out);
289 : }
290 :
291 : /*
292 : * Decode multiple posting list segments into an array of item pointers.
293 : * The number of items is returned in *ndecoded_out. The segments are stored
294 : * one after each other, with total size 'len' bytes.
295 : */
296 : ItemPointer
297 621572 : ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
298 : {
299 : ItemPointer result;
300 : int nallocated;
301 : uint64 val;
302 621572 : char *endseg = ((char *) segment) + len;
303 : int ndecoded;
304 : unsigned char *ptr;
305 : unsigned char *endptr;
306 :
307 : /*
308 : * Guess an initial size of the array.
309 : */
310 621572 : nallocated = segment->nbytes * 2 + 1;
311 621572 : result = palloc(nallocated * sizeof(ItemPointerData));
312 :
313 621572 : ndecoded = 0;
314 1246314 : while ((char *) segment < endseg)
315 : {
316 : /* enlarge output array if needed */
317 624742 : if (ndecoded >= nallocated)
318 : {
319 0 : nallocated *= 2;
320 0 : result = repalloc(result, nallocated * sizeof(ItemPointerData));
321 : }
322 :
323 : /* copy the first item */
324 : Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
325 : Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0);
326 624742 : result[ndecoded] = segment->first;
327 624742 : ndecoded++;
328 :
329 624742 : val = itemptr_to_uint64(&segment->first);
330 624742 : ptr = segment->bytes;
331 624742 : endptr = segment->bytes + segment->nbytes;
332 35010542 : while (ptr < endptr)
333 : {
334 : /* enlarge output array if needed */
335 34385800 : if (ndecoded >= nallocated)
336 : {
337 580 : nallocated *= 2;
338 580 : result = repalloc(result, nallocated * sizeof(ItemPointerData));
339 : }
340 :
341 34385800 : val += decode_varbyte(&ptr);
342 :
343 34385800 : uint64_to_itemptr(val, &result[ndecoded]);
344 34385800 : ndecoded++;
345 : }
346 624742 : segment = GinNextPostingListSegment(segment);
347 : }
348 :
349 621572 : if (ndecoded_out)
350 621572 : *ndecoded_out = ndecoded;
351 621572 : return result;
352 : }
353 :
354 : /*
355 : * Add all item pointers from a bunch of posting lists to a TIDBitmap.
356 : */
357 : int
358 30 : ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len,
359 : TIDBitmap *tbm)
360 : {
361 : int ndecoded;
362 : ItemPointer items;
363 :
364 30 : items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
365 30 : tbm_add_tuples(tbm, items, ndecoded, false);
366 30 : pfree(items);
367 :
368 30 : return ndecoded;
369 : }
370 :
371 : /*
372 : * Merge two ordered arrays of itempointers, eliminating any duplicates.
373 : *
374 : * Returns a palloc'd array, and *nmerged is set to the number of items in
375 : * the result, after eliminating duplicates.
376 : */
377 : ItemPointer
378 88494 : ginMergeItemPointers(ItemPointerData *a, uint32 na,
379 : ItemPointerData *b, uint32 nb,
380 : int *nmerged)
381 : {
382 : ItemPointerData *dst;
383 :
384 88494 : dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData));
385 :
386 : /*
387 : * If the argument arrays don't overlap, we can just append them to each
388 : * other.
389 : */
390 88494 : if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0)
391 : {
392 49188 : memcpy(dst, a, na * sizeof(ItemPointerData));
393 49188 : memcpy(&dst[na], b, nb * sizeof(ItemPointerData));
394 49188 : *nmerged = na + nb;
395 : }
396 39306 : else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0)
397 : {
398 39306 : memcpy(dst, b, nb * sizeof(ItemPointerData));
399 39306 : memcpy(&dst[nb], a, na * sizeof(ItemPointerData));
400 39306 : *nmerged = na + nb;
401 : }
402 : else
403 : {
404 0 : ItemPointerData *dptr = dst;
405 0 : ItemPointerData *aptr = a;
406 0 : ItemPointerData *bptr = b;
407 :
408 0 : while (aptr - a < na && bptr - b < nb)
409 : {
410 0 : int cmp = ginCompareItemPointers(aptr, bptr);
411 :
412 0 : if (cmp > 0)
413 0 : *dptr++ = *bptr++;
414 0 : else if (cmp == 0)
415 : {
416 : /* only keep one copy of the identical items */
417 0 : *dptr++ = *bptr++;
418 0 : aptr++;
419 : }
420 : else
421 0 : *dptr++ = *aptr++;
422 : }
423 :
424 0 : while (aptr - a < na)
425 0 : *dptr++ = *aptr++;
426 :
427 0 : while (bptr - b < nb)
428 0 : *dptr++ = *bptr++;
429 :
430 0 : *nmerged = dptr - dst;
431 : }
432 :
433 88494 : return dst;
434 : }
|