Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_stat_statements.c
4 : * Track statement planning and execution times as well as resource
5 : * usage across a whole database cluster.
6 : *
7 : * Execution costs are totaled for each distinct source query, and kept in
8 : * a shared hashtable. (We track only as many distinct queries as will fit
9 : * in the designated amount of shared memory.)
10 : *
11 : * Starting in Postgres 9.2, this module normalized query entries. As of
12 : * Postgres 14, the normalization is done by the core if compute_query_id is
13 : * enabled, or optionally by third-party modules.
14 : *
15 : * To facilitate presenting entries to users, we create "representative" query
16 : * strings in which constants are replaced with parameter symbols ($n), to
17 : * make it clearer what a normalized entry can represent. To save on shared
18 : * memory, and to avoid having to truncate oversized query strings, we store
19 : * these strings in a temporary external query-texts file. Offsets into this
20 : * file are kept in shared memory.
21 : *
22 : * Note about locking issues: to create or delete an entry in the shared
23 : * hashtable, one must hold pgss->lock exclusively. Modifying any field
24 : * in an entry except the counters requires the same. To look up an entry,
25 : * one must hold the lock shared. To read or update the counters within
26 : * an entry, one must hold the lock shared or exclusive (so the entry doesn't
27 : * disappear!) and also take the entry's mutex spinlock.
28 : * The shared state variable pgss->extent (the next free spot in the external
29 : * query-text file) should be accessed only while holding either the
30 : * pgss->mutex spinlock, or exclusive lock on pgss->lock. We use the mutex to
31 : * allow reserving file space while holding only shared lock on pgss->lock.
32 : * Rewriting the entire external query-text file, eg for garbage collection,
33 : * requires holding pgss->lock exclusively; this allows individual entries
34 : * in the file to be read or written while holding only shared lock.
35 : *
36 : *
37 : * Copyright (c) 2008-2023, PostgreSQL Global Development Group
38 : *
39 : * IDENTIFICATION
40 : * contrib/pg_stat_statements/pg_stat_statements.c
41 : *
42 : *-------------------------------------------------------------------------
43 : */
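/*
 * Illustrative sketch (assumed example, using a hypothetical table "tab"):
 * normalization replaces constants with $n symbols in order of appearance,
 * so
 *
 *     SELECT * FROM tab WHERE id = 42 AND name = 'foo'
 *
 * is represented as
 *
 *     SELECT * FROM tab WHERE id = $1 AND name = $2
 *
 * Likewise, per the locking rules above, bumping an entry's counters needs
 * only the shared LWLock plus the entry's spinlock, roughly:
 *
 *     LWLockAcquire(pgss->lock, LW_SHARED);
 *     entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
 *     if (entry)
 *     {
 *         SpinLockAcquire(&entry->mutex);
 *         entry->counters.calls[PGSS_EXEC] += 1;
 *         SpinLockRelease(&entry->mutex);
 *     }
 *     LWLockRelease(pgss->lock);
 *
 * pgss_store() below follows this pattern (using a volatile-qualified
 * pointer for the spinlocked accesses).
 */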
44 : #include "postgres.h"
45 :
46 : #include <math.h>
47 : #include <sys/stat.h>
48 : #include <unistd.h>
49 :
50 : #include "access/parallel.h"
51 : #include "catalog/pg_authid.h"
52 : #include "common/hashfn.h"
53 : #include "executor/instrument.h"
54 : #include "funcapi.h"
55 : #include "jit/jit.h"
56 : #include "mb/pg_wchar.h"
57 : #include "miscadmin.h"
58 : #include "nodes/queryjumble.h"
59 : #include "optimizer/planner.h"
60 : #include "parser/analyze.h"
61 : #include "parser/parsetree.h"
62 : #include "parser/scanner.h"
63 : #include "parser/scansup.h"
64 : #include "pgstat.h"
65 : #include "storage/fd.h"
66 : #include "storage/ipc.h"
67 : #include "storage/lwlock.h"
68 : #include "storage/shmem.h"
69 : #include "storage/spin.h"
70 : #include "tcop/utility.h"
71 : #include "utils/acl.h"
72 : #include "utils/builtins.h"
73 : #include "utils/memutils.h"
74 : #include "utils/timestamp.h"
75 :
76 8 : PG_MODULE_MAGIC;
77 :
78 : /* Location of permanent stats file (valid when database is shut down) */
79 : #define PGSS_DUMP_FILE PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_stat_statements.stat"
80 :
81 : /*
82 : * Location of external query text file.
83 : */
84 : #define PGSS_TEXT_FILE PG_STAT_TMP_DIR "/pgss_query_texts.stat"
85 :
86 : /* Magic number identifying the stats file format */
87 : static const uint32 PGSS_FILE_HEADER = 0x20220408;
88 :
89 : /* PostgreSQL major version number, changes in which invalidate all entries */
90 : static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
91 :
92 : /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
93 : #define USAGE_EXEC(duration) (1.0)
94 : #define USAGE_INIT (1.0) /* including initial planning */
95 : #define ASSUMED_MEDIAN_INIT (10.0) /* initial assumed median usage */
96 : #define ASSUMED_LENGTH_INIT 1024 /* initial assumed mean query length */
97 : #define USAGE_DECREASE_FACTOR (0.99) /* decreased every entry_dealloc */
98 : #define STICKY_DECREASE_FACTOR (0.50) /* factor for sticky entries */
99 : #define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */
100 : #define IS_STICKY(c) ((c.calls[PGSS_PLAN] + c.calls[PGSS_EXEC]) == 0)
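/*
 * Note on "sticky" entries (descriptive note; see entry_alloc() and
 * entry_dealloc()): an entry is sticky while its calls counters are all
 * zero, i.e. it was created early, at post-parse-analysis time, just to
 * hold a normalized query text and has not yet accumulated any planning or
 * execution stats.  Sticky entries decay faster in entry_dealloc()
 * (STICKY_DECREASE_FACTOR) so that entries whose execution never completes
 * do not clog the hashtable.
 */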
101 :
102 : /*
103 : * Utility statements that pgss_ProcessUtility and pgss_post_parse_analyze
104 : * ignore.
105 : */
106 : #define PGSS_HANDLED_UTILITY(n) (!IsA(n, ExecuteStmt) && \
107 : !IsA(n, PrepareStmt) && \
108 : !IsA(n, DeallocateStmt))
109 :
110 : /*
111 : * Extension version number, for supporting older extension versions' objects
112 : */
113 : typedef enum pgssVersion
114 : {
115 : PGSS_V1_0 = 0,
116 : PGSS_V1_1,
117 : PGSS_V1_2,
118 : PGSS_V1_3,
119 : PGSS_V1_8,
120 : PGSS_V1_9,
121 : PGSS_V1_10
122 : } pgssVersion;
123 :
124 : typedef enum pgssStoreKind
125 : {
126 : PGSS_INVALID = -1,
127 :
128 : /*
129 : * PGSS_PLAN and PGSS_EXEC must be respectively 0 and 1 as they're used to
130 : * reference the underlying values in the arrays in the Counters struct,
131 : * and this order is required in pg_stat_statements_internal().
132 : */
133 : PGSS_PLAN = 0,
134 : PGSS_EXEC,
135 :
136 : PGSS_NUMKIND /* Must be last value of this enum */
137 : } pgssStoreKind;
138 :
139 : /*
140 : * Hashtable key that defines the identity of a hashtable entry. We separate
141 : * queries by user and by database even if they are otherwise identical.
142 : *
143 : * If you add a new key to this struct, make sure to teach pgss_store() to
144 : * zero the padding bytes. Otherwise, things will break, because pgss_hash is
145 : * created using HASH_BLOBS, and thus tag_hash is used to hash this.
146 : *
147 : */
148 : typedef struct pgssHashKey
149 : {
150 : Oid userid; /* user OID */
151 : Oid dbid; /* database OID */
152 : uint64 queryid; /* query identifier */
153 : bool toplevel; /* query executed at top level */
154 : } pgssHashKey;
155 :
156 : /*
157 : * The actual stats counters kept within pgssEntry.
158 : */
159 : typedef struct Counters
160 : {
161 : int64 calls[PGSS_NUMKIND]; /* # of times planned/executed */
162 : double total_time[PGSS_NUMKIND]; /* total planning/execution time,
163 : * in msec */
164 : double min_time[PGSS_NUMKIND]; /* minimum planning/execution time in
165 : * msec */
166 : double max_time[PGSS_NUMKIND]; /* maximum planning/execution time in
167 : * msec */
168 : double mean_time[PGSS_NUMKIND]; /* mean planning/execution time in
169 : * msec */
170 : double sum_var_time[PGSS_NUMKIND]; /* sum of variances in
171 : * planning/execution time in msec */
172 : int64 rows; /* total # of retrieved or affected rows */
173 : int64 shared_blks_hit; /* # of shared buffer hits */
174 : int64 shared_blks_read; /* # of shared disk blocks read */
175 : int64 shared_blks_dirtied; /* # of shared disk blocks dirtied */
176 : int64 shared_blks_written; /* # of shared disk blocks written */
177 : int64 local_blks_hit; /* # of local buffer hits */
178 : int64 local_blks_read; /* # of local disk blocks read */
179 : int64 local_blks_dirtied; /* # of local disk blocks dirtied */
180 : int64 local_blks_written; /* # of local disk blocks written */
181 : int64 temp_blks_read; /* # of temp blocks read */
182 : int64 temp_blks_written; /* # of temp blocks written */
183 : double blk_read_time; /* time spent reading blocks, in msec */
184 : double blk_write_time; /* time spent writing blocks, in msec */
185 : double temp_blk_read_time; /* time spent reading temp blocks, in msec */
186 : double temp_blk_write_time; /* time spent writing temp blocks, in
187 : * msec */
188 : double usage; /* usage factor */
189 : int64 wal_records; /* # of WAL records generated */
190 : int64 wal_fpi; /* # of WAL full page images generated */
191 : uint64 wal_bytes; /* total amount of WAL generated in bytes */
192 : int64 jit_functions; /* total number of JIT functions emitted */
193 : double jit_generation_time; /* total time to generate jit code */
194 : int64 jit_inlining_count; /* number of times inlining time has been
195 : * > 0 */
196 : double jit_inlining_time; /* total time to inline jit code */
197 : int64 jit_optimization_count; /* number of times optimization time
198 : * has been > 0 */
199 : double jit_optimization_time; /* total time to optimize jit code */
200 : int64 jit_emission_count; /* number of times emission time has been
201 : * > 0 */
202 : double jit_emission_time; /* total time to emit jit code */
203 : } Counters;
204 :
205 : /*
206 : * Global statistics for pg_stat_statements
207 : */
208 : typedef struct pgssGlobalStats
209 : {
210 : int64 dealloc; /* # of times entries were deallocated */
211 : TimestampTz stats_reset; /* timestamp with all stats reset */
212 : } pgssGlobalStats;
213 :
214 : /*
215 : * Statistics per statement
216 : *
217 : * Note: in event of a failure in garbage collection of the query text file,
218 : * we reset query_offset to zero and query_len to -1. This will be seen as
219 : * an invalid state by qtext_fetch().
220 : */
221 : typedef struct pgssEntry
222 : {
223 : pgssHashKey key; /* hash key of entry - MUST BE FIRST */
224 : Counters counters; /* the statistics for this query */
225 : Size query_offset; /* query text offset in external file */
226 : int query_len; /* # of valid bytes in query string, or -1 */
227 : int encoding; /* query text encoding */
228 : slock_t mutex; /* protects the counters only */
229 : } pgssEntry;
230 :
231 : /*
232 : * Global shared state
233 : */
234 : typedef struct pgssSharedState
235 : {
236 : LWLock *lock; /* protects hashtable search/modification */
237 : double cur_median_usage; /* current median usage in hashtable */
238 : Size mean_query_len; /* current mean entry text length */
239 : slock_t mutex; /* protects following fields only: */
240 : Size extent; /* current extent of query file */
241 : int n_writers; /* number of active writers to query file */
242 : int gc_count; /* query file garbage collection cycle count */
243 : pgssGlobalStats stats; /* global statistics for pgss */
244 : } pgssSharedState;
245 :
246 : /*---- Local variables ----*/
247 :
248 : /* Current nesting depth of ExecutorRun+ProcessUtility calls */
249 : static int exec_nested_level = 0;
250 :
251 : /* Current nesting depth of planner calls */
252 : static int plan_nested_level = 0;
253 :
254 : /* Saved hook values in case of unload */
255 : static shmem_request_hook_type prev_shmem_request_hook = NULL;
256 : static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
257 : static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL;
258 : static planner_hook_type prev_planner_hook = NULL;
259 : static ExecutorStart_hook_type prev_ExecutorStart = NULL;
260 : static ExecutorRun_hook_type prev_ExecutorRun = NULL;
261 : static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
262 : static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
263 : static ProcessUtility_hook_type prev_ProcessUtility = NULL;
264 :
265 : /* Links to shared memory state */
266 : static pgssSharedState *pgss = NULL;
267 : static HTAB *pgss_hash = NULL;
268 :
269 : /*---- GUC variables ----*/
270 :
271 : typedef enum
272 : {
273 : PGSS_TRACK_NONE, /* track no statements */
274 : PGSS_TRACK_TOP, /* only top level statements */
275 : PGSS_TRACK_ALL /* all statements, including nested ones */
276 : } PGSSTrackLevel;
277 :
278 : static const struct config_enum_entry track_options[] =
279 : {
280 : {"none", PGSS_TRACK_NONE, false},
281 : {"top", PGSS_TRACK_TOP, false},
282 : {"all", PGSS_TRACK_ALL, false},
283 : {NULL, 0, false}
284 : };
285 :
286 : static int pgss_max = 5000; /* max # statements to track */
287 : static int pgss_track = PGSS_TRACK_TOP; /* tracking level */
288 : static bool pgss_track_utility = true; /* whether to track utility commands */
289 : static bool pgss_track_planning = false; /* whether to track planning
290 : * duration */
291 : static bool pgss_save = true; /* whether to save stats across shutdown */
292 :
293 :
294 : #define pgss_enabled(level) \
295 : (!IsParallelWorker() && \
296 : (pgss_track == PGSS_TRACK_ALL || \
297 : (pgss_track == PGSS_TRACK_TOP && (level) == 0)))
298 :
299 : #define record_gc_qtexts() \
300 : do { \
301 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; \
302 : SpinLockAcquire(&s->mutex); \
303 : s->gc_count++; \
304 : SpinLockRelease(&s->mutex); \
305 : } while(0)
306 :
307 : /*---- Function declarations ----*/
308 :
309 10 : PG_FUNCTION_INFO_V1(pg_stat_statements_reset);
310 24 : PG_FUNCTION_INFO_V1(pg_stat_statements_reset_1_7);
311 0 : PG_FUNCTION_INFO_V1(pg_stat_statements_1_2);
312 10 : PG_FUNCTION_INFO_V1(pg_stat_statements_1_3);
313 8 : PG_FUNCTION_INFO_V1(pg_stat_statements_1_8);
314 10 : PG_FUNCTION_INFO_V1(pg_stat_statements_1_9);
315 28 : PG_FUNCTION_INFO_V1(pg_stat_statements_1_10);
316 0 : PG_FUNCTION_INFO_V1(pg_stat_statements);
317 10 : PG_FUNCTION_INFO_V1(pg_stat_statements_info);
318 :
319 : static void pgss_shmem_request(void);
320 : static void pgss_shmem_startup(void);
321 : static void pgss_shmem_shutdown(int code, Datum arg);
322 : static void pgss_post_parse_analyze(ParseState *pstate, Query *query,
323 : JumbleState *jstate);
324 : static PlannedStmt *pgss_planner(Query *parse,
325 : const char *query_string,
326 : int cursorOptions,
327 : ParamListInfo boundParams);
328 : static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
329 : static void pgss_ExecutorRun(QueryDesc *queryDesc,
330 : ScanDirection direction,
331 : uint64 count, bool execute_once);
332 : static void pgss_ExecutorFinish(QueryDesc *queryDesc);
333 : static void pgss_ExecutorEnd(QueryDesc *queryDesc);
334 : static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
335 : bool readOnlyTree,
336 : ProcessUtilityContext context, ParamListInfo params,
337 : QueryEnvironment *queryEnv,
338 : DestReceiver *dest, QueryCompletion *qc);
339 : static void pgss_store(const char *query, uint64 queryId,
340 : int query_location, int query_len,
341 : pgssStoreKind kind,
342 : double total_time, uint64 rows,
343 : const BufferUsage *bufusage,
344 : const WalUsage *walusage,
345 : const struct JitInstrumentation *jitusage,
346 : JumbleState *jstate);
347 : static void pg_stat_statements_internal(FunctionCallInfo fcinfo,
348 : pgssVersion api_version,
349 : bool showtext);
350 : static Size pgss_memsize(void);
351 : static pgssEntry *entry_alloc(pgssHashKey *key, Size query_offset, int query_len,
352 : int encoding, bool sticky);
353 : static void entry_dealloc(void);
354 : static bool qtext_store(const char *query, int query_len,
355 : Size *query_offset, int *gc_count);
356 : static char *qtext_load_file(Size *buffer_size);
357 : static char *qtext_fetch(Size query_offset, int query_len,
358 : char *buffer, Size buffer_size);
359 : static bool need_gc_qtexts(void);
360 : static void gc_qtexts(void);
361 : static void entry_reset(Oid userid, Oid dbid, uint64 queryid);
362 : static char *generate_normalized_query(JumbleState *jstate, const char *query,
363 : int query_loc, int *query_len_p);
364 : static void fill_in_constant_lengths(JumbleState *jstate, const char *query,
365 : int query_loc);
366 : static int comp_location(const void *a, const void *b);
367 :
368 :
369 : /*
370 : * Module load callback
371 : */
372 : void
373 8 : _PG_init(void)
374 : {
375 : /*
376 : * In order to create our shared memory area, we have to be loaded via
377 : * shared_preload_libraries. If not, fall out without hooking into any of
378 : * the main system. (We don't throw error here because it seems useful to
379 : * allow the pg_stat_statements functions to be created even when the
380 : * module isn't active. The functions must protect themselves against
381 : * being called then, however.)
382 : */
383 8 : if (!process_shared_preload_libraries_in_progress)
384 2 : return;
385 :
386 : /*
387 : * Inform the postmaster that we want to enable query_id calculation if
388 : * compute_query_id is set to auto.
389 : */
390 6 : EnableQueryId();
391 :
392 : /*
393 : * Define (or redefine) custom GUC variables.
394 : */
395 6 : DefineCustomIntVariable("pg_stat_statements.max",
396 : "Sets the maximum number of statements tracked by pg_stat_statements.",
397 : NULL,
398 : &pgss_max,
399 : 5000,
400 : 100,
401 : INT_MAX / 2,
402 : PGC_POSTMASTER,
403 : 0,
404 : NULL,
405 : NULL,
406 : NULL);
407 :
408 6 : DefineCustomEnumVariable("pg_stat_statements.track",
409 : "Selects which statements are tracked by pg_stat_statements.",
410 : NULL,
411 : &pgss_track,
412 : PGSS_TRACK_TOP,
413 : track_options,
414 : PGC_SUSET,
415 : 0,
416 : NULL,
417 : NULL,
418 : NULL);
419 :
420 6 : DefineCustomBoolVariable("pg_stat_statements.track_utility",
421 : "Selects whether utility commands are tracked by pg_stat_statements.",
422 : NULL,
423 : &pgss_track_utility,
424 : true,
425 : PGC_SUSET,
426 : 0,
427 : NULL,
428 : NULL,
429 : NULL);
430 :
431 6 : DefineCustomBoolVariable("pg_stat_statements.track_planning",
432 : "Selects whether planning duration is tracked by pg_stat_statements.",
433 : NULL,
434 : &pgss_track_planning,
435 : false,
436 : PGC_SUSET,
437 : 0,
438 : NULL,
439 : NULL,
440 : NULL);
441 :
442 6 : DefineCustomBoolVariable("pg_stat_statements.save",
443 : "Save pg_stat_statements statistics across server shutdowns.",
444 : NULL,
445 : &pgss_save,
446 : true,
447 : PGC_SIGHUP,
448 : 0,
449 : NULL,
450 : NULL,
451 : NULL);
452 :
453 6 : MarkGUCPrefixReserved("pg_stat_statements");
454 :
455 : /*
456 : * Install hooks.
457 : */
458 6 : prev_shmem_request_hook = shmem_request_hook;
459 6 : shmem_request_hook = pgss_shmem_request;
460 6 : prev_shmem_startup_hook = shmem_startup_hook;
461 6 : shmem_startup_hook = pgss_shmem_startup;
462 6 : prev_post_parse_analyze_hook = post_parse_analyze_hook;
463 6 : post_parse_analyze_hook = pgss_post_parse_analyze;
464 6 : prev_planner_hook = planner_hook;
465 6 : planner_hook = pgss_planner;
466 6 : prev_ExecutorStart = ExecutorStart_hook;
467 6 : ExecutorStart_hook = pgss_ExecutorStart;
468 6 : prev_ExecutorRun = ExecutorRun_hook;
469 6 : ExecutorRun_hook = pgss_ExecutorRun;
470 6 : prev_ExecutorFinish = ExecutorFinish_hook;
471 6 : ExecutorFinish_hook = pgss_ExecutorFinish;
472 6 : prev_ExecutorEnd = ExecutorEnd_hook;
473 6 : ExecutorEnd_hook = pgss_ExecutorEnd;
474 6 : prev_ProcessUtility = ProcessUtility_hook;
475 6 : ProcessUtility_hook = pgss_ProcessUtility;
476 : }
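/*
 * Typical setup (assumed usage, not taken from this file): preload the
 * library and create the extension in each database of interest, e.g.
 *
 *     # postgresql.conf
 *     shared_preload_libraries = 'pg_stat_statements'
 *     pg_stat_statements.max = 10000
 *     pg_stat_statements.track = top
 *     pg_stat_statements.track_planning = off
 *     pg_stat_statements.save = on
 *
 * followed by a server restart and CREATE EXTENSION pg_stat_statements.
 * If the library is not preloaded, the SQL-callable functions defined in
 * this file fail with "pg_stat_statements must be loaded via
 * shared_preload_libraries".
 */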
477 :
478 : /*
479 : * shmem_request hook: request additional shared resources. We'll allocate or
480 : * attach to the shared resources in pgss_shmem_startup().
481 : */
482 : static void
483 6 : pgss_shmem_request(void)
484 : {
485 6 : if (prev_shmem_request_hook)
486 0 : prev_shmem_request_hook();
487 :
488 6 : RequestAddinShmemSpace(pgss_memsize());
489 6 : RequestNamedLWLockTranche("pg_stat_statements", 1);
490 6 : }
491 :
492 : /*
493 : * shmem_startup hook: allocate or attach to shared memory,
494 : * then load any pre-existing statistics from file.
495 : * Also create and load the query-texts file, which is expected to exist
496 : * (even if empty) while the module is enabled.
497 : */
498 : static void
499 6 : pgss_shmem_startup(void)
500 : {
501 : bool found;
502 : HASHCTL info;
503 6 : FILE *file = NULL;
504 6 : FILE *qfile = NULL;
505 : uint32 header;
506 : int32 num;
507 : int32 pgver;
508 : int32 i;
509 : int buffer_size;
510 6 : char *buffer = NULL;
511 :
512 6 : if (prev_shmem_startup_hook)
513 0 : prev_shmem_startup_hook();
514 :
515 : /* reset in case this is a restart within the postmaster */
516 6 : pgss = NULL;
517 6 : pgss_hash = NULL;
518 :
519 : /*
520 : * Create or attach to the shared memory state, including hash table
521 : */
522 6 : LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
523 :
524 6 : pgss = ShmemInitStruct("pg_stat_statements",
525 : sizeof(pgssSharedState),
526 : &found);
527 :
528 6 : if (!found)
529 : {
530 : /* First time through ... */
531 6 : pgss->lock = &(GetNamedLWLockTranche("pg_stat_statements"))->lock;
532 6 : pgss->cur_median_usage = ASSUMED_MEDIAN_INIT;
533 6 : pgss->mean_query_len = ASSUMED_LENGTH_INIT;
534 6 : SpinLockInit(&pgss->mutex);
535 6 : pgss->extent = 0;
536 6 : pgss->n_writers = 0;
537 6 : pgss->gc_count = 0;
538 6 : pgss->stats.dealloc = 0;
539 6 : pgss->stats.stats_reset = GetCurrentTimestamp();
540 : }
541 :
542 6 : info.keysize = sizeof(pgssHashKey);
543 6 : info.entrysize = sizeof(pgssEntry);
544 6 : pgss_hash = ShmemInitHash("pg_stat_statements hash",
545 : pgss_max, pgss_max,
546 : &info,
547 : HASH_ELEM | HASH_BLOBS);
548 :
549 6 : LWLockRelease(AddinShmemInitLock);
550 :
551 : /*
552 : * If we're in the postmaster (or a standalone backend...), set up a shmem
553 : * exit hook to dump the statistics to disk.
554 : */
555 6 : if (!IsUnderPostmaster)
556 6 : on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);
557 :
558 : /*
559 : * Done if some other process already completed our initialization.
560 : */
561 6 : if (found)
562 6 : return;
563 :
564 : /*
565 : * Note: we don't bother with locks here, because there should be no other
566 : * processes running when this code is reached.
567 : */
568 :
569 : /* Unlink query text file possibly left over from crash */
570 6 : unlink(PGSS_TEXT_FILE);
571 :
572 : /* Allocate new query text temp file */
573 6 : qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
574 6 : if (qfile == NULL)
575 0 : goto write_error;
576 :
577 : /*
578 : * If we were told not to load old statistics, we're done. (Note we do
579 : * not try to unlink any old dump file in this case. This seems a bit
580 : * questionable but it's the historical behavior.)
581 : */
582 6 : if (!pgss_save)
583 : {
584 0 : FreeFile(qfile);
585 0 : return;
586 : }
587 :
588 : /*
589 : * Attempt to load old statistics from the dump file.
590 : */
591 6 : file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
592 6 : if (file == NULL)
593 : {
594 6 : if (errno != ENOENT)
595 0 : goto read_error;
596 : /* No existing persisted stats file, so we're done */
597 6 : FreeFile(qfile);
598 6 : return;
599 : }
600 :
601 0 : buffer_size = 2048;
602 0 : buffer = (char *) palloc(buffer_size);
603 :
604 0 : if (fread(&header, sizeof(uint32), 1, file) != 1 ||
605 0 : fread(&pgver, sizeof(uint32), 1, file) != 1 ||
606 0 : fread(&num, sizeof(int32), 1, file) != 1)
607 0 : goto read_error;
608 :
609 0 : if (header != PGSS_FILE_HEADER ||
610 0 : pgver != PGSS_PG_MAJOR_VERSION)
611 0 : goto data_error;
612 :
613 0 : for (i = 0; i < num; i++)
614 : {
615 : pgssEntry temp;
616 : pgssEntry *entry;
617 : Size query_offset;
618 :
619 0 : if (fread(&temp, sizeof(pgssEntry), 1, file) != 1)
620 0 : goto read_error;
621 :
622 : /* Encoding is the only field we can easily sanity-check */
623 0 : if (!PG_VALID_BE_ENCODING(temp.encoding))
624 0 : goto data_error;
625 :
626 : /* Resize buffer as needed */
627 0 : if (temp.query_len >= buffer_size)
628 : {
629 0 : buffer_size = Max(buffer_size * 2, temp.query_len + 1);
630 0 : buffer = repalloc(buffer, buffer_size);
631 : }
632 :
633 0 : if (fread(buffer, 1, temp.query_len + 1, file) != temp.query_len + 1)
634 0 : goto read_error;
635 :
636 : /* Should have a trailing null, but let's make sure */
637 0 : buffer[temp.query_len] = '\0';
638 :
639 : /* Skip loading "sticky" entries */
640 0 : if (IS_STICKY(temp.counters))
641 0 : continue;
642 :
643 : /* Store the query text */
644 0 : query_offset = pgss->extent;
645 0 : if (fwrite(buffer, 1, temp.query_len + 1, qfile) != temp.query_len + 1)
646 0 : goto write_error;
647 0 : pgss->extent += temp.query_len + 1;
648 :
649 : /* make the hashtable entry (discards old entries if too many) */
650 0 : entry = entry_alloc(&temp.key, query_offset, temp.query_len,
651 : temp.encoding,
652 : false);
653 :
654 : /* copy in the actual stats */
655 0 : entry->counters = temp.counters;
656 : }
657 :
658 : /* Read global statistics for pg_stat_statements */
659 0 : if (fread(&pgss->stats, sizeof(pgssGlobalStats), 1, file) != 1)
660 0 : goto read_error;
661 :
662 0 : pfree(buffer);
663 0 : FreeFile(file);
664 0 : FreeFile(qfile);
665 :
666 : /*
667 : * Remove the persisted stats file so it's not included in
668 : * backups/replication standbys, etc. A new file will be written on next
669 : * shutdown.
670 : *
671 : * Note: it's okay if the PGSS_TEXT_FILE is included in a basebackup,
672 : * because we remove that file on startup; it acts inversely to
673 : * PGSS_DUMP_FILE, in that it is only supposed to be around when the
674 : * server is running, whereas PGSS_DUMP_FILE is only supposed to be around
675 : * when the server is not running. Leaving the file creates no danger of
676 : * a newly restored database having a spurious record of execution costs,
677 : * which is what we're really concerned about here.
678 : */
679 0 : unlink(PGSS_DUMP_FILE);
680 :
681 0 : return;
682 :
683 0 : read_error:
684 0 : ereport(LOG,
685 : (errcode_for_file_access(),
686 : errmsg("could not read file \"%s\": %m",
687 : PGSS_DUMP_FILE)));
688 0 : goto fail;
689 0 : data_error:
690 0 : ereport(LOG,
691 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
692 : errmsg("ignoring invalid data in file \"%s\"",
693 : PGSS_DUMP_FILE)));
694 0 : goto fail;
695 0 : write_error:
696 0 : ereport(LOG,
697 : (errcode_for_file_access(),
698 : errmsg("could not write file \"%s\": %m",
699 : PGSS_TEXT_FILE)));
700 0 : fail:
701 0 : if (buffer)
702 0 : pfree(buffer);
703 0 : if (file)
704 0 : FreeFile(file);
705 0 : if (qfile)
706 0 : FreeFile(qfile);
707 : /* If possible, throw away the bogus file; ignore any error */
708 0 : unlink(PGSS_DUMP_FILE);
709 :
710 : /*
711 : * Don't unlink PGSS_TEXT_FILE here; it should always be around while the
712 : * server is running with pg_stat_statements enabled
713 : */
714 : }
715 :
716 : /*
717 : * shmem_shutdown hook: Dump statistics into file.
718 : *
719 : * Note: we don't bother with acquiring lock, because there should be no
720 : * other processes running when this is called.
721 : */
722 : static void
723 6 : pgss_shmem_shutdown(int code, Datum arg)
724 : {
725 : FILE *file;
726 6 : char *qbuffer = NULL;
727 6 : Size qbuffer_size = 0;
728 : HASH_SEQ_STATUS hash_seq;
729 : int32 num_entries;
730 : pgssEntry *entry;
731 :
732 : /* Don't try to dump during a crash. */
733 6 : if (code)
734 6 : return;
735 :
736 : /* Safety check ... shouldn't get here unless shmem is set up. */
737 6 : if (!pgss || !pgss_hash)
738 0 : return;
739 :
740 : /* Don't dump if told not to. */
741 6 : if (!pgss_save)
742 0 : return;
743 :
744 6 : file = AllocateFile(PGSS_DUMP_FILE ".tmp", PG_BINARY_W);
745 6 : if (file == NULL)
746 0 : goto error;
747 :
748 6 : if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)
749 0 : goto error;
750 6 : if (fwrite(&PGSS_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
751 0 : goto error;
752 6 : num_entries = hash_get_num_entries(pgss_hash);
753 6 : if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
754 0 : goto error;
755 :
756 6 : qbuffer = qtext_load_file(&qbuffer_size);
757 6 : if (qbuffer == NULL)
758 0 : goto error;
759 :
760 : /*
761 : * When serializing to disk, we store query texts immediately after their
762 : * entry data. Any orphaned query texts are thereby excluded.
763 : */
764 6 : hash_seq_init(&hash_seq, pgss_hash);
765 48798 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
766 : {
767 48792 : int len = entry->query_len;
768 48792 : char *qstr = qtext_fetch(entry->query_offset, len,
769 : qbuffer, qbuffer_size);
770 :
771 48792 : if (qstr == NULL)
772 0 : continue; /* Ignore any entries with bogus texts */
773 :
774 48792 : if (fwrite(entry, sizeof(pgssEntry), 1, file) != 1 ||
775 48792 : fwrite(qstr, 1, len + 1, file) != len + 1)
776 : {
777 : /* note: we assume hash_seq_term won't change errno */
778 0 : hash_seq_term(&hash_seq);
779 0 : goto error;
780 : }
781 : }
782 :
783 : /* Dump global statistics for pg_stat_statements */
784 6 : if (fwrite(&pgss->stats, sizeof(pgssGlobalStats), 1, file) != 1)
785 0 : goto error;
786 :
787 6 : free(qbuffer);
788 6 : qbuffer = NULL;
789 :
790 6 : if (FreeFile(file))
791 : {
792 0 : file = NULL;
793 0 : goto error;
794 : }
795 :
796 : /*
797 : * Rename file into place, so we atomically replace any old one.
798 : */
799 6 : (void) durable_rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE, LOG);
800 :
801 : /* Unlink query-texts file; it's not needed while the server is shut down */
802 6 : unlink(PGSS_TEXT_FILE);
803 :
804 6 : return;
805 :
806 0 : error:
807 0 : ereport(LOG,
808 : (errcode_for_file_access(),
809 : errmsg("could not write file \"%s\": %m",
810 : PGSS_DUMP_FILE ".tmp")));
811 0 : free(qbuffer);
812 0 : if (file)
813 0 : FreeFile(file);
814 0 : unlink(PGSS_DUMP_FILE ".tmp");
815 0 : unlink(PGSS_TEXT_FILE);
816 : }
817 :
818 : /*
819 : * Post-parse-analysis hook: mark query with a queryId
820 : */
821 : static void
822 120838 : pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate)
823 : {
824 120838 : if (prev_post_parse_analyze_hook)
825 0 : prev_post_parse_analyze_hook(pstate, query, jstate);
826 :
827 : /* Safety check... */
828 120838 : if (!pgss || !pgss_hash || !pgss_enabled(exec_nested_level))
829 22930 : return;
830 :
831 : /*
832 : * Clear queryId for prepared-statement-related utility commands, as they
833 : * will inherit the queryId of the underlying statement (except DEALLOCATE,
834 : * which is entirely untracked).
835 : */
836 97908 : if (query->utilityStmt)
837 : {
838 45922 : if (pgss_track_utility && !PGSS_HANDLED_UTILITY(query->utilityStmt))
839 : {
840 2968 : query->queryId = UINT64CONST(0);
841 2968 : return;
842 : }
843 : }
844 :
845 : /*
846 : * If query jumbling were able to identify any ignorable constants, we
847 : * immediately create a hash table entry for the query, so that we can
848 : * record the normalized form of the query string. If there were no such
849 : * constants, the normalized string would be the same as the query text
850 : * anyway, so there's no need for an early entry.
851 : */
852 94940 : if (jstate && jstate->clocations_count > 0)
853 46846 : pgss_store(pstate->p_sourcetext,
854 : query->queryId,
855 : query->stmt_location,
856 : query->stmt_len,
857 : PGSS_INVALID,
858 : 0,
859 : 0,
860 : NULL,
861 : NULL,
862 : NULL,
863 : jstate);
864 : }
865 :
866 : /*
867 : * Planner hook: forward to regular planner, but measure planning time
868 : * if needed.
869 : */
870 : static PlannedStmt *
871 73524 : pgss_planner(Query *parse,
872 : const char *query_string,
873 : int cursorOptions,
874 : ParamListInfo boundParams)
875 : {
876 : PlannedStmt *result;
877 :
878 : /*
879 : * We can't process the query if no query_string is provided, as
880 : * pgss_store needs it. We also ignore queries without a queryId, as they
881 : * would be treated as utility statements, which may not be the case.
882 : *
883 : * Note that planner_hook can be called from the planner itself, so we
884 : * have a specific nesting level for the planner. However, utility
885 : * commands containing optimizable statements can also call the planner,
886 : * same for regular DML (for instance for underlying foreign key queries).
887 : * So testing the planner nesting level alone is not enough to detect a real
888 : * top-level planner call.
889 : */
890 73524 : if (pgss_enabled(plan_nested_level + exec_nested_level)
891 52296 : && pgss_track_planning && query_string
892 118 : && parse->queryId != UINT64CONST(0))
893 116 : {
894 : instr_time start;
895 : instr_time duration;
896 : BufferUsage bufusage_start,
897 : bufusage;
898 : WalUsage walusage_start,
899 : walusage;
900 :
901 : /* We need to track buffer usage as the planner can access them. */
902 116 : bufusage_start = pgBufferUsage;
903 :
904 : /*
905 : * Similarly, the planner could write some WAL records in some cases
906 : * (e.g. when setting a hint bit that is WAL-logged).
907 : */
908 116 : walusage_start = pgWalUsage;
909 116 : INSTR_TIME_SET_CURRENT(start);
910 :
911 116 : plan_nested_level++;
912 116 : PG_TRY();
913 : {
914 116 : if (prev_planner_hook)
915 0 : result = prev_planner_hook(parse, query_string, cursorOptions,
916 : boundParams);
917 : else
918 116 : result = standard_planner(parse, query_string, cursorOptions,
919 : boundParams);
920 : }
921 0 : PG_FINALLY();
922 : {
923 116 : plan_nested_level--;
924 : }
925 116 : PG_END_TRY();
926 :
927 116 : INSTR_TIME_SET_CURRENT(duration);
928 116 : INSTR_TIME_SUBTRACT(duration, start);
929 :
930 : /* calc differences of buffer counters. */
931 116 : memset(&bufusage, 0, sizeof(BufferUsage));
932 116 : BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
933 :
934 : /* calc differences of WAL counters. */
935 116 : memset(&walusage, 0, sizeof(WalUsage));
936 116 : WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
937 :
938 116 : pgss_store(query_string,
939 : parse->queryId,
940 : parse->stmt_location,
941 : parse->stmt_len,
942 : PGSS_PLAN,
943 116 : INSTR_TIME_GET_MILLISEC(duration),
944 : 0,
945 : &bufusage,
946 : &walusage,
947 : NULL,
948 : NULL);
949 : }
950 : else
951 : {
952 73408 : if (prev_planner_hook)
953 0 : result = prev_planner_hook(parse, query_string, cursorOptions,
954 : boundParams);
955 : else
956 73408 : result = standard_planner(parse, query_string, cursorOptions,
957 : boundParams);
958 : }
959 :
960 72502 : return result;
961 : }
962 :
963 : /*
964 : * ExecutorStart hook: start up tracking if needed
965 : */
966 : static void
967 86926 : pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
968 : {
969 86926 : if (prev_ExecutorStart)
970 0 : prev_ExecutorStart(queryDesc, eflags);
971 : else
972 86926 : standard_ExecutorStart(queryDesc, eflags);
973 :
974 : /*
975 : * If the query has queryId zero, don't track it. This prevents double
976 : * counting of optimizable statements that are directly contained in
977 : * utility statements.
978 : */
979 86400 : if (pgss_enabled(exec_nested_level) && queryDesc->plannedstmt->queryId != UINT64CONST(0))
980 : {
981 : /*
982 : * Set up to track total elapsed time in ExecutorRun. Make sure the
983 : * space is allocated in the per-query context so it will go away at
984 : * ExecutorEnd.
985 : */
986 52900 : if (queryDesc->totaltime == NULL)
987 : {
988 : MemoryContext oldcxt;
989 :
990 52900 : oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
991 52900 : queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
992 52900 : MemoryContextSwitchTo(oldcxt);
993 : }
994 : }
995 86400 : }
996 :
997 : /*
998 : * ExecutorRun hook: all we need do is track nesting depth
999 : */
1000 : static void
1001 84882 : pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
1002 : bool execute_once)
1003 : {
1004 84882 : exec_nested_level++;
1005 84882 : PG_TRY();
1006 : {
1007 84882 : if (prev_ExecutorRun)
1008 0 : prev_ExecutorRun(queryDesc, direction, count, execute_once);
1009 : else
1010 84882 : standard_ExecutorRun(queryDesc, direction, count, execute_once);
1011 : }
1012 6222 : PG_FINALLY();
1013 : {
1014 84882 : exec_nested_level--;
1015 : }
1016 84882 : PG_END_TRY();
1017 78660 : }
1018 :
1019 : /*
1020 : * ExecutorFinish hook: all we need do is track nesting depth
1021 : */
1022 : static void
1023 75402 : pgss_ExecutorFinish(QueryDesc *queryDesc)
1024 : {
1025 75402 : exec_nested_level++;
1026 75402 : PG_TRY();
1027 : {
1028 75402 : if (prev_ExecutorFinish)
1029 0 : prev_ExecutorFinish(queryDesc);
1030 : else
1031 75402 : standard_ExecutorFinish(queryDesc);
1032 : }
1033 262 : PG_FINALLY();
1034 : {
1035 75402 : exec_nested_level--;
1036 : }
1037 75402 : PG_END_TRY();
1038 75140 : }
1039 :
1040 : /*
1041 : * ExecutorEnd hook: store results if needed
1042 : */
1043 : static void
1044 79836 : pgss_ExecutorEnd(QueryDesc *queryDesc)
1045 : {
1046 79836 : uint64 queryId = queryDesc->plannedstmt->queryId;
1047 :
1048 79836 : if (queryId != UINT64CONST(0) && queryDesc->totaltime &&
1049 50734 : pgss_enabled(exec_nested_level))
1050 : {
1051 : /*
1052 : * Make sure stats accumulation is done. (Note: it's okay if several
1053 : * levels of hook all do this.)
1054 : */
1055 50734 : InstrEndLoop(queryDesc->totaltime);
1056 :
1057 50542 : pgss_store(queryDesc->sourceText,
1058 : queryId,
1059 50734 : queryDesc->plannedstmt->stmt_location,
1060 50734 : queryDesc->plannedstmt->stmt_len,
1061 : PGSS_EXEC,
1062 50734 : queryDesc->totaltime->total * 1000.0, /* convert to msec */
1063 50734 : queryDesc->estate->es_total_processed,
1064 50734 : &queryDesc->totaltime->bufusage,
1065 50734 : &queryDesc->totaltime->walusage,
1066 50734 : queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL,
1067 : NULL);
1068 : }
1069 :
1070 79836 : if (prev_ExecutorEnd)
1071 0 : prev_ExecutorEnd(queryDesc);
1072 : else
1073 79836 : standard_ExecutorEnd(queryDesc);
1074 79836 : }
1075 :
1076 : /*
1077 : * ProcessUtility hook
1078 : */
1079 : static void
1080 54504 : pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1081 : bool readOnlyTree,
1082 : ProcessUtilityContext context,
1083 : ParamListInfo params, QueryEnvironment *queryEnv,
1084 : DestReceiver *dest, QueryCompletion *qc)
1085 : {
1086 54504 : Node *parsetree = pstmt->utilityStmt;
1087 54504 : uint64 saved_queryId = pstmt->queryId;
1088 54504 : int saved_stmt_location = pstmt->stmt_location;
1089 54504 : int saved_stmt_len = pstmt->stmt_len;
1090 :
1091 : /*
1092 : * Force utility statements to get queryId zero. We do this even in cases
1093 : * where the statement contains an optimizable statement for which a
1094 : * queryId could be derived (such as EXPLAIN or DECLARE CURSOR). For such
1095 : * cases, runtime control will first go through ProcessUtility and then
1096 : * the executor, and we don't want the executor hooks to do anything,
1097 : * since we are already measuring the statement's costs at the utility
1098 : * level.
1099 : *
1100 : * Note that this is only done if pg_stat_statements is enabled and
1101 : * configured to track utility statements, in the unlikely event that
1102 : * the user has configured another extension to handle utility statements
1103 : * only.
1104 : */
1105 54504 : if (pgss_enabled(exec_nested_level) && pgss_track_utility)
1106 45842 : pstmt->queryId = UINT64CONST(0);
1107 :
1108 : /*
1109 : * If it's an EXECUTE statement, we don't track it and don't increment the
1110 : * nesting level. This allows the cycles to be charged to the underlying
1111 : * PREPARE instead (by the Executor hooks), which is much more useful.
1112 : *
1113 : * We also don't track execution of PREPARE. If we did, we would get one
1114 : * hash table entry for the PREPARE (with hash calculated from the query
1115 : * string), and then a different one with the same query string (but hash
1116 : * calculated from the query tree) would be used to accumulate costs of
1117 : * ensuing EXECUTEs. This would be confusing, and inconsistent with other
1118 : * cases where planning time is not included at all.
1119 : *
1120 : * Likewise, we don't track execution of DEALLOCATE.
1121 : */
1122 54504 : if (pgss_track_utility && pgss_enabled(exec_nested_level) &&
1123 45842 : PGSS_HANDLED_UTILITY(parsetree))
1124 38884 : {
1125 : instr_time start;
1126 : instr_time duration;
1127 : uint64 rows;
1128 : BufferUsage bufusage_start,
1129 : bufusage;
1130 : WalUsage walusage_start,
1131 : walusage;
1132 :
1133 42876 : bufusage_start = pgBufferUsage;
1134 42876 : walusage_start = pgWalUsage;
1135 42876 : INSTR_TIME_SET_CURRENT(start);
1136 :
1137 42876 : exec_nested_level++;
1138 42876 : PG_TRY();
1139 : {
1140 42876 : if (prev_ProcessUtility)
1141 0 : prev_ProcessUtility(pstmt, queryString, readOnlyTree,
1142 : context, params, queryEnv,
1143 : dest, qc);
1144 : else
1145 42876 : standard_ProcessUtility(pstmt, queryString, readOnlyTree,
1146 : context, params, queryEnv,
1147 : dest, qc);
1148 : }
1149 3992 : PG_FINALLY();
1150 : {
1151 42876 : exec_nested_level--;
1152 : }
1153 42876 : PG_END_TRY();
1154 :
1155 : /*
1156 : * CAUTION: do not access the *pstmt data structure again below here.
1157 : * If it was a ROLLBACK or similar, that data structure may have been
1158 : * freed. We must copy everything we still need into local variables,
1159 : * which we did above.
1160 : *
1161 : * For the same reason, we can't risk restoring pstmt->queryId to its
1162 : * former value, which'd otherwise be a good idea.
1163 : */
1164 :
1165 38884 : INSTR_TIME_SET_CURRENT(duration);
1166 38884 : INSTR_TIME_SUBTRACT(duration, start);
1167 :
1168 : /*
1169 : * Track the total number of rows retrieved or affected by the utility
1170 : * statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED
1171 : * VIEW, REFRESH MATERIALIZED VIEW and SELECT INTO.
1172 : */
1173 38884 : rows = (qc && (qc->commandTag == CMDTAG_COPY ||
1174 36338 : qc->commandTag == CMDTAG_FETCH ||
1175 35824 : qc->commandTag == CMDTAG_SELECT ||
1176 35480 : qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ?
1177 77768 : qc->nprocessed : 0;
1178 :
1179 : /* calc differences of buffer counters. */
1180 38884 : memset(&bufusage, 0, sizeof(BufferUsage));
1181 38884 : BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
1182 :
1183 : /* calc differences of WAL counters. */
1184 38884 : memset(&walusage, 0, sizeof(WalUsage));
1185 38884 : WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
1186 :
1187 38884 : pgss_store(queryString,
1188 : saved_queryId,
1189 : saved_stmt_location,
1190 : saved_stmt_len,
1191 : PGSS_EXEC,
1192 38884 : INSTR_TIME_GET_MILLISEC(duration),
1193 : rows,
1194 : &bufusage,
1195 : &walusage,
1196 : NULL,
1197 : NULL);
1198 : }
1199 : else
1200 : {
1201 11628 : if (prev_ProcessUtility)
1202 0 : prev_ProcessUtility(pstmt, queryString, readOnlyTree,
1203 : context, params, queryEnv,
1204 : dest, qc);
1205 : else
1206 11628 : standard_ProcessUtility(pstmt, queryString, readOnlyTree,
1207 : context, params, queryEnv,
1208 : dest, qc);
1209 : }
1210 50320 : }
1211 :
1212 : /*
1213 : * Store some statistics for a statement.
1214 : *
1215 : * If jstate is not NULL then we're trying to create an entry for which
1216 : * we have no statistics as yet; we just want to record the normalized
1217 : * query string. total_time, rows, bufusage and walusage are ignored in this
1218 : * case.
1219 : *
1220 : * If kind is PGSS_PLAN or PGSS_EXEC, its value is used as the array position
1221 : * for the arrays in the Counters field.
1222 : */
1223 : static void
1224 136580 : pgss_store(const char *query, uint64 queryId,
1225 : int query_location, int query_len,
1226 : pgssStoreKind kind,
1227 : double total_time, uint64 rows,
1228 : const BufferUsage *bufusage,
1229 : const WalUsage *walusage,
1230 : const struct JitInstrumentation *jitusage,
1231 : JumbleState *jstate)
1232 : {
1233 : pgssHashKey key;
1234 : pgssEntry *entry;
1235 136580 : char *norm_query = NULL;
1236 136580 : int encoding = GetDatabaseEncoding();
1237 :
1238 : Assert(query != NULL);
1239 :
1240 : /* Safety check... */
1241 136580 : if (!pgss || !pgss_hash)
1242 0 : return;
1243 :
1244 : /*
1245 : * Nothing to do if compute_query_id isn't enabled and no other module
1246 : * computed a query identifier.
1247 : */
1248 136580 : if (queryId == UINT64CONST(0))
1249 0 : return;
1250 :
1251 : /*
1252 : * Confine our attention to the relevant part of the string, if the query
1253 : * is a portion of a multi-statement source string, and update query
1254 : * location and length if needed.
1255 : */
1256 136580 : query = CleanQuerytext(query, &query_location, &query_len);
1257 :
1258 : /* Set up key for hashtable search */
1259 :
1260 : /* memset() is required when pgssHashKey is without padding only */
1261 136580 : memset(&key, 0, sizeof(pgssHashKey));
1262 :
1263 136580 : key.userid = GetUserId();
1264 136580 : key.dbid = MyDatabaseId;
1265 136580 : key.queryid = queryId;
1266 136580 : key.toplevel = (exec_nested_level == 0);
1267 :
1268 : /* Lookup the hash table entry with shared lock. */
1269 136580 : LWLockAcquire(pgss->lock, LW_SHARED);
1270 :
1271 136580 : entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
1272 :
1273 : /* Create new entry, if not present */
1274 136580 : if (!entry)
1275 : {
1276 : Size query_offset;
1277 : int gc_count;
1278 : bool stored;
1279 : bool do_gc;
1280 :
1281 : /*
1282 : * Create a new, normalized query string if caller asked. We don't
1283 : * need to hold the lock while doing this work. (Note: in any case,
1284 : * it's possible that someone else creates a duplicate hashtable entry
1285 : * in the interval where we don't hold the lock below. That case is
1286 : * handled by entry_alloc.)
1287 : */
1288 49400 : if (jstate)
1289 : {
1290 17604 : LWLockRelease(pgss->lock);
1291 17604 : norm_query = generate_normalized_query(jstate, query,
1292 : query_location,
1293 : &query_len);
1294 17604 : LWLockAcquire(pgss->lock, LW_SHARED);
1295 : }
1296 :
1297 : /* Append new query text to file with only shared lock held */
1298 49400 : stored = qtext_store(norm_query ? norm_query : query, query_len,
1299 : &query_offset, &gc_count);
1300 :
1301 : /*
1302 : * Determine whether we need to garbage collect external query texts
1303 : * while the shared lock is still held. This micro-optimization
1304 : * avoids taking the time to decide this while holding exclusive lock.
1305 : */
1306 49400 : do_gc = need_gc_qtexts();
1307 :
1308 : /* Need exclusive lock to make a new hashtable entry - promote */
1309 49400 : LWLockRelease(pgss->lock);
1310 49400 : LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
1311 :
1312 : /*
1313 : * A garbage collection may have occurred while we weren't holding the
1314 : * lock. In the unlikely event that this happens, the query text we
1315 : * stored above will have been garbage collected, so write it again.
1316 : * This should be infrequent enough that doing it while holding
1317 : * exclusive lock isn't a performance problem.
1318 : */
1319 49400 : if (!stored || pgss->gc_count != gc_count)
1320 0 : stored = qtext_store(norm_query ? norm_query : query, query_len,
1321 : &query_offset, NULL);
1322 :
1323 : /* If we failed to write to the text file, give up */
1324 49400 : if (!stored)
1325 0 : goto done;
1326 :
1327 : /* OK to create a new hashtable entry */
1328 49400 : entry = entry_alloc(&key, query_offset, query_len, encoding,
1329 : jstate != NULL);
1330 :
1331 : /* If needed, perform garbage collection while exclusive lock held */
1332 49400 : if (do_gc)
1333 0 : gc_qtexts();
1334 : }
1335 :
1336 : /* Increment the counts, except when jstate is not NULL */
1337 136580 : if (!jstate)
1338 : {
1339 : /*
1340 : * Grab the spinlock while updating the counters (see comment about
1341 : * locking rules at the head of the file)
1342 : */
1343 89734 : volatile pgssEntry *e = (volatile pgssEntry *) entry;
1344 :
1345 : Assert(kind == PGSS_PLAN || kind == PGSS_EXEC);
1346 :
1347 89734 : SpinLockAcquire(&e->mutex);
1348 :
1349 : /* "Unstick" entry if it was previously sticky */
1350 89734 : if (IS_STICKY(e->counters))
1351 48278 : e->counters.usage = USAGE_INIT;
1352 :
1353 89734 : e->counters.calls[kind] += 1;
1354 89734 : e->counters.total_time[kind] += total_time;
1355 :
1356 89734 : if (e->counters.calls[kind] == 1)
1357 : {
1358 48362 : e->counters.min_time[kind] = total_time;
1359 48362 : e->counters.max_time[kind] = total_time;
1360 48362 : e->counters.mean_time[kind] = total_time;
1361 : }
1362 : else
1363 : {
1364 : /*
1365 : * Welford's method for accurately computing variance. See
1366 : * <http://www.johndcook.com/blog/standard_deviation/>
1367 : */
1368 41372 : double old_mean = e->counters.mean_time[kind];
1369 :
1370 41372 : e->counters.mean_time[kind] +=
1371 41372 : (total_time - old_mean) / e->counters.calls[kind];
1372 41372 : e->counters.sum_var_time[kind] +=
1373 41372 : (total_time - old_mean) * (total_time - e->counters.mean_time[kind]);
1374 :
1375 : /* calculate min and max time */
1376 41372 : if (e->counters.min_time[kind] > total_time)
1377 11144 : e->counters.min_time[kind] = total_time;
1378 41372 : if (e->counters.max_time[kind] < total_time)
1379 4766 : e->counters.max_time[kind] = total_time;
1380 : }
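/*
 * Added note (not in the upstream source): with Welford's recurrence
 * above, after n = calls[kind] measurements
 *
 *     mean_n = mean_(n-1) + (x_n - mean_(n-1)) / n
 *     S_n    = S_(n-1)    + (x_n - mean_(n-1)) * (x_n - mean_n)
 *
 * and pg_stat_statements_internal() later reports the population standard
 * deviation as sqrt(sum_var_time[kind] / calls[kind]).
 */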
1381 89734 : e->counters.rows += rows;
1382 89734 : e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1383 89734 : e->counters.shared_blks_read += bufusage->shared_blks_read;
1384 89734 : e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
1385 89734 : e->counters.shared_blks_written += bufusage->shared_blks_written;
1386 89734 : e->counters.local_blks_hit += bufusage->local_blks_hit;
1387 89734 : e->counters.local_blks_read += bufusage->local_blks_read;
1388 89734 : e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
1389 89734 : e->counters.local_blks_written += bufusage->local_blks_written;
1390 89734 : e->counters.temp_blks_read += bufusage->temp_blks_read;
1391 89734 : e->counters.temp_blks_written += bufusage->temp_blks_written;
1392 89734 : e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1393 89734 : e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
1394 89734 : e->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time);
1395 89734 : e->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time);
1396 89734 : e->counters.usage += USAGE_EXEC(total_time);
1397 89734 : e->counters.wal_records += walusage->wal_records;
1398 89734 : e->counters.wal_fpi += walusage->wal_fpi;
1399 89734 : e->counters.wal_bytes += walusage->wal_bytes;
1400 89734 : if (jitusage)
1401 : {
1402 192 : e->counters.jit_functions += jitusage->created_functions;
1403 192 : e->counters.jit_generation_time += INSTR_TIME_GET_MILLISEC(jitusage->generation_counter);
1404 :
1405 192 : if (INSTR_TIME_GET_MILLISEC(jitusage->inlining_counter))
1406 132 : e->counters.jit_inlining_count++;
1407 192 : e->counters.jit_inlining_time += INSTR_TIME_GET_MILLISEC(jitusage->inlining_counter);
1408 :
1409 192 : if (INSTR_TIME_GET_MILLISEC(jitusage->optimization_counter))
1410 188 : e->counters.jit_optimization_count++;
1411 192 : e->counters.jit_optimization_time += INSTR_TIME_GET_MILLISEC(jitusage->optimization_counter);
1412 :
1413 192 : if (INSTR_TIME_GET_MILLISEC(jitusage->emission_counter))
1414 188 : e->counters.jit_emission_count++;
1415 192 : e->counters.jit_emission_time += INSTR_TIME_GET_MILLISEC(jitusage->emission_counter);
1416 : }
1417 :
1418 89734 : SpinLockRelease(&e->mutex);
1419 : }
1420 :
1421 46846 : done:
1422 136580 : LWLockRelease(pgss->lock);
1423 :
1424 : /* We postpone this clean-up until we're out of the lock */
1425 136580 : if (norm_query)
1426 17604 : pfree(norm_query);
1427 : }
1428 :
1429 : /*
1430 : * Reset statement statistics corresponding to userid, dbid, and queryid.
1431 : */
1432 : Datum
1433 80 : pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS)
1434 : {
1435 : Oid userid;
1436 : Oid dbid;
1437 : uint64 queryid;
1438 :
1439 80 : userid = PG_GETARG_OID(0);
1440 80 : dbid = PG_GETARG_OID(1);
1441 80 : queryid = (uint64) PG_GETARG_INT64(2);
1442 :
1443 80 : entry_reset(userid, dbid, queryid);
1444 :
1445 80 : PG_RETURN_VOID();
1446 : }
1447 :
1448 : /*
1449 : * Reset statement statistics.
1450 : */
1451 : Datum
1452 2 : pg_stat_statements_reset(PG_FUNCTION_ARGS)
1453 : {
1454 2 : entry_reset(0, 0, 0);
1455 :
1456 2 : PG_RETURN_VOID();
1457 : }
1458 :
1459 : /* Number of output arguments (columns) for various API versions */
1460 : #define PG_STAT_STATEMENTS_COLS_V1_0 14
1461 : #define PG_STAT_STATEMENTS_COLS_V1_1 18
1462 : #define PG_STAT_STATEMENTS_COLS_V1_2 19
1463 : #define PG_STAT_STATEMENTS_COLS_V1_3 23
1464 : #define PG_STAT_STATEMENTS_COLS_V1_8 32
1465 : #define PG_STAT_STATEMENTS_COLS_V1_9 33
1466 : #define PG_STAT_STATEMENTS_COLS_V1_10 43
1467 : #define PG_STAT_STATEMENTS_COLS 43 /* maximum of above */
1468 :
1469 : /*
1470 : * Retrieve statement statistics.
1471 : *
1472 : * The SQL API of this function has changed multiple times, and will likely
1473 : * do so again in future. To support the case where a newer version of this
1474 : * loadable module is being used with an old SQL declaration of the function,
1475 : * we continue to support the older API versions. For 1.2 and later, the
1476 : * expected API version is identified by embedding it in the C name of the
1477 : * function. Unfortunately we weren't bright enough to do that for 1.1.
1478 : */
1479 : Datum
1480 82 : pg_stat_statements_1_10(PG_FUNCTION_ARGS)
1481 : {
1482 82 : bool showtext = PG_GETARG_BOOL(0);
1483 :
1484 82 : pg_stat_statements_internal(fcinfo, PGSS_V1_10, showtext);
1485 :
1486 82 : return (Datum) 0;
1487 : }
1488 :
1489 : Datum
1490 2 : pg_stat_statements_1_9(PG_FUNCTION_ARGS)
1491 : {
1492 2 : bool showtext = PG_GETARG_BOOL(0);
1493 :
1494 2 : pg_stat_statements_internal(fcinfo, PGSS_V1_9, showtext);
1495 :
1496 2 : return (Datum) 0;
1497 : }
1498 :
1499 : Datum
1500 0 : pg_stat_statements_1_8(PG_FUNCTION_ARGS)
1501 : {
1502 0 : bool showtext = PG_GETARG_BOOL(0);
1503 :
1504 0 : pg_stat_statements_internal(fcinfo, PGSS_V1_8, showtext);
1505 :
1506 0 : return (Datum) 0;
1507 : }
1508 :
1509 : Datum
1510 2 : pg_stat_statements_1_3(PG_FUNCTION_ARGS)
1511 : {
1512 2 : bool showtext = PG_GETARG_BOOL(0);
1513 :
1514 2 : pg_stat_statements_internal(fcinfo, PGSS_V1_3, showtext);
1515 :
1516 2 : return (Datum) 0;
1517 : }
1518 :
1519 : Datum
1520 0 : pg_stat_statements_1_2(PG_FUNCTION_ARGS)
1521 : {
1522 0 : bool showtext = PG_GETARG_BOOL(0);
1523 :
1524 0 : pg_stat_statements_internal(fcinfo, PGSS_V1_2, showtext);
1525 :
1526 0 : return (Datum) 0;
1527 : }
1528 :
1529 : /*
1530 : * Legacy entry point for pg_stat_statements() API versions 1.0 and 1.1.
1531 : * This can be removed someday, perhaps.
1532 : */
1533 : Datum
1534 0 : pg_stat_statements(PG_FUNCTION_ARGS)
1535 : {
1536 : /* If it's really API 1.1, we'll figure that out below */
1537 0 : pg_stat_statements_internal(fcinfo, PGSS_V1_0, true);
1538 :
1539 0 : return (Datum) 0;
1540 : }
1541 :
1542 : /* Common code for all versions of pg_stat_statements() */
1543 : static void
1544 86 : pg_stat_statements_internal(FunctionCallInfo fcinfo,
1545 : pgssVersion api_version,
1546 : bool showtext)
1547 : {
1548 86 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1549 86 : Oid userid = GetUserId();
1550 86 : bool is_allowed_role = false;
1551 86 : char *qbuffer = NULL;
1552 86 : Size qbuffer_size = 0;
1553 86 : Size extent = 0;
1554 86 : int gc_count = 0;
1555 : HASH_SEQ_STATUS hash_seq;
1556 : pgssEntry *entry;
1557 :
1558 : /*
1559 : * Superusers or roles with the privileges of pg_read_all_stats members
1560 : * are allowed
1561 : */
1562 86 : is_allowed_role = has_privs_of_role(userid, ROLE_PG_READ_ALL_STATS);
1563 :
1564 : /* hash table must exist already */
1565 86 : if (!pgss || !pgss_hash)
1566 0 : ereport(ERROR,
1567 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1568 : errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
1569 :
1570 86 : InitMaterializedSRF(fcinfo, 0);
1571 :
1572 : /*
1573 : * Check we have the expected number of output arguments. Aside from
1574 : * being a good safety check, we need a kluge here to detect API version
1575 : * 1.1, which was wedged into the code in an ill-considered way.
1576 : */
1577 86 : switch (rsinfo->setDesc->natts)
1578 : {
1579 0 : case PG_STAT_STATEMENTS_COLS_V1_0:
1580 0 : if (api_version != PGSS_V1_0)
1581 0 : elog(ERROR, "incorrect number of output arguments");
1582 0 : break;
1583 0 : case PG_STAT_STATEMENTS_COLS_V1_1:
1584 : /* pg_stat_statements() should have told us 1.0 */
1585 0 : if (api_version != PGSS_V1_0)
1586 0 : elog(ERROR, "incorrect number of output arguments");
1587 0 : api_version = PGSS_V1_1;
1588 0 : break;
1589 0 : case PG_STAT_STATEMENTS_COLS_V1_2:
1590 0 : if (api_version != PGSS_V1_2)
1591 0 : elog(ERROR, "incorrect number of output arguments");
1592 0 : break;
1593 2 : case PG_STAT_STATEMENTS_COLS_V1_3:
1594 2 : if (api_version != PGSS_V1_3)
1595 0 : elog(ERROR, "incorrect number of output arguments");
1596 2 : break;
1597 0 : case PG_STAT_STATEMENTS_COLS_V1_8:
1598 0 : if (api_version != PGSS_V1_8)
1599 0 : elog(ERROR, "incorrect number of output arguments");
1600 0 : break;
1601 2 : case PG_STAT_STATEMENTS_COLS_V1_9:
1602 2 : if (api_version != PGSS_V1_9)
1603 0 : elog(ERROR, "incorrect number of output arguments");
1604 2 : break;
1605 82 : case PG_STAT_STATEMENTS_COLS_V1_10:
1606 82 : if (api_version != PGSS_V1_10)
1607 0 : elog(ERROR, "incorrect number of output arguments");
1608 82 : break;
1609 0 : default:
1610 0 : elog(ERROR, "incorrect number of output arguments");
1611 : }
1612 :
1613 : /*
1614 : * We'd like to load the query text file (if needed) while not holding any
1615 : * lock on pgss->lock. In the worst case we'll have to do this again
1616 : * after we have the lock, but it's unlikely enough to make this a win
1617 : * despite occasional duplicated work. We need to reload if anybody
1618 : * writes to the file (either a retail qtext_store(), or a garbage
1619 : * collection) between this point and where we've gotten shared lock. If
1620 : * a qtext_store is actually in progress when we look, we might as well
1621 : * skip the speculative load entirely.
1622 : */
1623 86 : if (showtext)
1624 : {
1625 : int n_writers;
1626 :
1627 : /* Take the mutex so we can examine variables */
1628 : {
1629 86 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
1630 :
1631 86 : SpinLockAcquire(&s->mutex);
1632 86 : extent = s->extent;
1633 86 : n_writers = s->n_writers;
1634 86 : gc_count = s->gc_count;
1635 86 : SpinLockRelease(&s->mutex);
1636 : }
1637 :
1638 : /* No point in loading file now if there are active writers */
1639 86 : if (n_writers == 0)
1640 86 : qbuffer = qtext_load_file(&qbuffer_size);
1641 : }
1642 :
1643 : /*
1644 : * Get shared lock, load or reload the query text file if we must, and
1645 : * iterate over the hashtable entries.
1646 : *
1647 : * With a large hash table, we might be holding the lock rather longer
1648 : * than one could wish. However, this only blocks creation of new hash
1649 : * table entries, and the larger the hash table the less likely that is to
1650 : * be needed. So we can hope this is okay. Perhaps someday we'll decide
1651 : * we need to partition the hash table to limit the time spent holding any
1652 : * one lock.
1653 : */
1654 86 : LWLockAcquire(pgss->lock, LW_SHARED);
1655 :
1656 86 : if (showtext)
1657 : {
1658 : /*
1659 : * Here it is safe to examine extent and gc_count without taking the
1660 : * mutex. Note that although other processes might change
1661 : * pgss->extent just after we look at it, the strings they then write
1662 : * into the file cannot yet be referenced in the hashtable, so we
1663 : * don't care whether we see them or not.
1664 : *
1665 : * If qtext_load_file fails, we just press on; we'll return NULL for
1666 : * every query text.
1667 : */
1668 86 : if (qbuffer == NULL ||
1669 86 : pgss->extent != extent ||
1670 86 : pgss->gc_count != gc_count)
1671 : {
1672 0 : free(qbuffer);
1673 0 : qbuffer = qtext_load_file(&qbuffer_size);
1674 : }
1675 : }
1676 :
1677 86 : hash_seq_init(&hash_seq, pgss_hash);
1678 48018 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
1679 : {
1680 : Datum values[PG_STAT_STATEMENTS_COLS];
1681 : bool nulls[PG_STAT_STATEMENTS_COLS];
1682 47932 : int i = 0;
1683 : Counters tmp;
1684 : double stddev;
1685 47932 : int64 queryid = entry->key.queryid;
1686 :
1687 47932 : memset(values, 0, sizeof(values));
1688 47932 : memset(nulls, 0, sizeof(nulls));
1689 :
1690 47932 : values[i++] = ObjectIdGetDatum(entry->key.userid);
1691 47932 : values[i++] = ObjectIdGetDatum(entry->key.dbid);
1692 47932 : if (api_version >= PGSS_V1_9)
1693 47912 : values[i++] = BoolGetDatum(entry->key.toplevel);
1694 :
1695 47932 : if (is_allowed_role || entry->key.userid == userid)
1696 : {
1697 47932 : if (api_version >= PGSS_V1_2)
1698 47932 : values[i++] = Int64GetDatumFast(queryid);
1699 :
1700 47932 : if (showtext)
1701 : {
1702 47932 : char *qstr = qtext_fetch(entry->query_offset,
1703 : entry->query_len,
1704 : qbuffer,
1705 : qbuffer_size);
1706 :
1707 47932 : if (qstr)
1708 : {
1709 : char *enc;
1710 :
1711 47932 : enc = pg_any_to_server(qstr,
1712 : entry->query_len,
1713 : entry->encoding);
1714 :
1715 47932 : values[i++] = CStringGetTextDatum(enc);
1716 :
1717 47932 : if (enc != qstr)
1718 0 : pfree(enc);
1719 : }
1720 : else
1721 : {
1722 : /* Just return a null if we fail to find the text */
1723 0 : nulls[i++] = true;
1724 : }
1725 : }
1726 : else
1727 : {
1728 : /* Query text not requested */
1729 0 : nulls[i++] = true;
1730 : }
1731 : }
1732 : else
1733 : {
1734 : /* Don't show queryid */
1735 0 : if (api_version >= PGSS_V1_2)
1736 0 : nulls[i++] = true;
1737 :
1738 : /*
1739 : * Don't show query text, but hint as to the reason for not doing
1740 : * so if it was requested
1741 : */
1742 0 : if (showtext)
1743 0 : values[i++] = CStringGetTextDatum("<insufficient privilege>");
1744 : else
1745 0 : nulls[i++] = true;
1746 : }
1747 :
1748 : /* copy counters to a local variable to keep locking time short */
1749 : {
1750 47932 : volatile pgssEntry *e = (volatile pgssEntry *) entry;
1751 :
1752 47932 : SpinLockAcquire(&e->mutex);
1753 47932 : tmp = e->counters;
1754 47932 : SpinLockRelease(&e->mutex);
1755 : }
1756 :
1757 : /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1758 47932 : if (IS_STICKY(tmp))
1759 1140 : continue;
1760 :
1761 : /* Note that we rely on PGSS_PLAN being 0 and PGSS_EXEC being 1. */
1762 140376 : for (int kind = 0; kind < PGSS_NUMKIND; kind++)
1763 : {
1764 93584 : if (kind == PGSS_EXEC || api_version >= PGSS_V1_8)
1765 : {
1766 93566 : values[i++] = Int64GetDatumFast(tmp.calls[kind]);
1767 93566 : values[i++] = Float8GetDatumFast(tmp.total_time[kind]);
1768 : }
1769 :
1770 93584 : if ((kind == PGSS_EXEC && api_version >= PGSS_V1_3) ||
1771 : api_version >= PGSS_V1_8)
1772 : {
1773 93566 : values[i++] = Float8GetDatumFast(tmp.min_time[kind]);
1774 93566 : values[i++] = Float8GetDatumFast(tmp.max_time[kind]);
1775 93566 : values[i++] = Float8GetDatumFast(tmp.mean_time[kind]);
1776 :
1777 : /*
1778 : * Note we are calculating the population variance here, not
1779 : * the sample variance, as we have data for the whole
1780 : * population, so Bessel's correction is not used, and we
1781 : * don't divide by tmp.calls - 1.
1782 : */
1783 93566 : if (tmp.calls[kind] > 1)
1784 8228 : stddev = sqrt(tmp.sum_var_time[kind] / tmp.calls[kind]);
1785 : else
1786 85338 : stddev = 0.0;
1787 93566 : values[i++] = Float8GetDatumFast(stddev);
1788 : }
1789 : }
1790 46792 : values[i++] = Int64GetDatumFast(tmp.rows);
1791 46792 : values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1792 46792 : values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
1793 46792 : if (api_version >= PGSS_V1_1)
1794 46792 : values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
1795 46792 : values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1796 46792 : values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1797 46792 : values[i++] = Int64GetDatumFast(tmp.local_blks_read);
1798 46792 : if (api_version >= PGSS_V1_1)
1799 46792 : values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
1800 46792 : values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1801 46792 : values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1802 46792 : values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
1803 46792 : if (api_version >= PGSS_V1_1)
1804 : {
1805 46792 : values[i++] = Float8GetDatumFast(tmp.blk_read_time);
1806 46792 : values[i++] = Float8GetDatumFast(tmp.blk_write_time);
1807 : }
1808 46792 : if (api_version >= PGSS_V1_10)
1809 : {
1810 46750 : values[i++] = Float8GetDatumFast(tmp.temp_blk_read_time);
1811 46750 : values[i++] = Float8GetDatumFast(tmp.temp_blk_write_time);
1812 : }
1813 46792 : if (api_version >= PGSS_V1_8)
1814 : {
1815 : char buf[256];
1816 : Datum wal_bytes;
1817 :
1818 46774 : values[i++] = Int64GetDatumFast(tmp.wal_records);
1819 46774 : values[i++] = Int64GetDatumFast(tmp.wal_fpi);
1820 :
1821 46774 : snprintf(buf, sizeof buf, UINT64_FORMAT, tmp.wal_bytes);
1822 :
1823 : /* Convert to numeric. */
1824 46774 : wal_bytes = DirectFunctionCall3(numeric_in,
1825 : CStringGetDatum(buf),
1826 : ObjectIdGetDatum(0),
1827 : Int32GetDatum(-1));
1828 46774 : values[i++] = wal_bytes;
1829 : }
1830 46792 : if (api_version >= PGSS_V1_10)
1831 : {
1832 46750 : values[i++] = Int64GetDatumFast(tmp.jit_functions);
1833 46750 : values[i++] = Float8GetDatumFast(tmp.jit_generation_time);
1834 46750 : values[i++] = Int64GetDatumFast(tmp.jit_inlining_count);
1835 46750 : values[i++] = Float8GetDatumFast(tmp.jit_inlining_time);
1836 46750 : values[i++] = Int64GetDatumFast(tmp.jit_optimization_count);
1837 46750 : values[i++] = Float8GetDatumFast(tmp.jit_optimization_time);
1838 46750 : values[i++] = Int64GetDatumFast(tmp.jit_emission_count);
1839 46750 : values[i++] = Float8GetDatumFast(tmp.jit_emission_time);
1840 : }
1841 :
1842 : Assert(i == (api_version == PGSS_V1_0 ? PG_STAT_STATEMENTS_COLS_V1_0 :
1843 : api_version == PGSS_V1_1 ? PG_STAT_STATEMENTS_COLS_V1_1 :
1844 : api_version == PGSS_V1_2 ? PG_STAT_STATEMENTS_COLS_V1_2 :
1845 : api_version == PGSS_V1_3 ? PG_STAT_STATEMENTS_COLS_V1_3 :
1846 : api_version == PGSS_V1_8 ? PG_STAT_STATEMENTS_COLS_V1_8 :
1847 : api_version == PGSS_V1_9 ? PG_STAT_STATEMENTS_COLS_V1_9 :
1848 : api_version == PGSS_V1_10 ? PG_STAT_STATEMENTS_COLS_V1_10 :
1849 : -1 /* fail if you forget to update this assert */ ));
1850 :
1851 46792 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
1852 : }
1853 :
1854 86 : LWLockRelease(pgss->lock);
1855 :
1856 86 : free(qbuffer);
1857 86 : }
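/*
 * The standard deviation reported above is the population value,
 * sqrt(sum_var_time / calls), with no Bessel correction.  Below is a
 * minimal, self-contained sketch of that arithmetic; the numbers and the
 * local variable names are invented purely for illustration.
 */
#include <math.h>
#include <stdio.h>

int
main(void)
{
	/* hypothetical counters: 4 executions, 36.0 ms^2 of squared deviation */
	double		sum_var_time = 36.0;
	long		calls = 4;
	double		stddev;

	/* population variance: divide by n, not by n - 1 */
	if (calls > 1)
		stddev = sqrt(sum_var_time / calls);
	else
		stddev = 0.0;

	printf("stddev = %.1f ms\n", stddev);	/* prints 3.0 */
	return 0;
}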
1858 :
1859 : /* Number of output arguments (columns) for pg_stat_statements_info */
1860 : #define PG_STAT_STATEMENTS_INFO_COLS 2
1861 :
1862 : /*
1863 : * Return statistics of pg_stat_statements.
1864 : */
1865 : Datum
1866 2 : pg_stat_statements_info(PG_FUNCTION_ARGS)
1867 : {
1868 : pgssGlobalStats stats;
1869 : TupleDesc tupdesc;
1870 2 : Datum values[PG_STAT_STATEMENTS_INFO_COLS] = {0};
1871 2 : bool nulls[PG_STAT_STATEMENTS_INFO_COLS] = {0};
1872 :
1873 2 : if (!pgss || !pgss_hash)
1874 0 : ereport(ERROR,
1875 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1876 : errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
1877 :
1878 : /* Build a tuple descriptor for our result type */
1879 2 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1880 0 : elog(ERROR, "return type must be a row type");
1881 :
1882 : /* Read global statistics for pg_stat_statements */
1883 : {
1884 2 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
1885 :
1886 2 : SpinLockAcquire(&s->mutex);
1887 2 : stats = s->stats;
1888 2 : SpinLockRelease(&s->mutex);
1889 : }
1890 :
1891 2 : values[0] = Int64GetDatum(stats.dealloc);
1892 2 : values[1] = TimestampTzGetDatum(stats.stats_reset);
1893 :
1894 2 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
1895 : }
1896 :
1897 : /*
1898 : * Estimate shared memory space needed.
1899 : */
1900 : static Size
1901 6 : pgss_memsize(void)
1902 : {
1903 : Size size;
1904 :
1905 6 : size = MAXALIGN(sizeof(pgssSharedState));
1906 6 : size = add_size(size, hash_estimate_size(pgss_max, sizeof(pgssEntry)));
1907 :
1908 6 : return size;
1909 : }
1910 :
1911 : /*
1912 : * Allocate a new hashtable entry.
1913 : * caller must hold an exclusive lock on pgss->lock
1914 : *
1915 : * "query" need not be null-terminated; we rely on query_len instead
1916 : *
1917 : * If "sticky" is true, make the new entry artificially sticky so that it will
1918 : * probably still be there when the query finishes execution. We do this by
1919 : * giving it a median usage value rather than the normal value. (Strictly
1920 : * speaking, query strings are normalized on a best effort basis, though it
1921 : * would be difficult to demonstrate this even under artificial conditions.)
1922 : *
1923 : * Note: despite needing exclusive lock, it's not an error for the target
1924 : * entry to already exist. This is because pgss_store releases and
1925 : * reacquires lock after failing to find a match; so someone else could
1926 : * have made the entry while we waited to get exclusive lock.
1927 : */
1928 : static pgssEntry *
1929 49400 : entry_alloc(pgssHashKey *key, Size query_offset, int query_len, int encoding,
1930 : bool sticky)
1931 : {
1932 : pgssEntry *entry;
1933 : bool found;
1934 :
1935 : /* Make space if needed */
1936 49400 : while (hash_get_num_entries(pgss_hash) >= pgss_max)
1937 0 : entry_dealloc();
1938 :
1939 : /* Find or create an entry with desired hash code */
1940 49400 : entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found);
1941 :
1942 49400 : if (!found)
1943 : {
1944 : /* New entry, initialize it */
1945 :
1946 : /* reset the statistics */
1947 49400 : memset(&entry->counters, 0, sizeof(Counters));
1948 : /* set the appropriate initial usage count */
1949 49400 : entry->counters.usage = sticky ? pgss->cur_median_usage : USAGE_INIT;
1950             :          /* re-initialize the mutex each time ... we assume no one is using it */
1951 49400 : SpinLockInit(&entry->mutex);
1952 : /* ... and don't forget the query text metadata */
1953 : Assert(query_len >= 0);
1954 49400 : entry->query_offset = query_offset;
1955 49400 : entry->query_len = query_len;
1956 49400 : entry->encoding = encoding;
1957 : }
1958 :
1959 49400 : return entry;
1960 : }
1961 :
1962 : /*
1963 : * qsort comparator for sorting into increasing usage order
1964 : */
1965 : static int
1966 0 : entry_cmp(const void *lhs, const void *rhs)
1967 : {
1968 0 : double l_usage = (*(pgssEntry *const *) lhs)->counters.usage;
1969 0 : double r_usage = (*(pgssEntry *const *) rhs)->counters.usage;
1970 :
1971 0 : if (l_usage < r_usage)
1972 0 : return -1;
1973 0 : else if (l_usage > r_usage)
1974 0 : return +1;
1975 : else
1976 0 : return 0;
1977 : }
1978 :
1979 : /*
1980 : * Deallocate least-used entries.
1981 : *
1982 : * Caller must hold an exclusive lock on pgss->lock.
1983 : */
1984 : static void
1985 0 : entry_dealloc(void)
1986 : {
1987 : HASH_SEQ_STATUS hash_seq;
1988 : pgssEntry **entries;
1989 : pgssEntry *entry;
1990 : int nvictims;
1991 : int i;
1992 : Size tottextlen;
1993 : int nvalidtexts;
1994 :
1995 : /*
1996 : * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1997 : * While we're scanning the table, apply the decay factor to the usage
1998 : * values, and update the mean query length.
1999 : *
2000 : * Note that the mean query length is almost immediately obsolete, since
2001             :      * we compute it before, not after, discarding the least-used entries.
2002 : * Hopefully, that doesn't affect the mean too much; it doesn't seem worth
2003 : * making two passes to get a more current result. Likewise, the new
2004 : * cur_median_usage includes the entries we're about to zap.
2005 : */
2006 :
2007 0 : entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));
2008 :
2009 0 : i = 0;
2010 0 : tottextlen = 0;
2011 0 : nvalidtexts = 0;
2012 :
2013 0 : hash_seq_init(&hash_seq, pgss_hash);
2014 0 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
2015 : {
2016 0 : entries[i++] = entry;
2017 : /* "Sticky" entries get a different usage decay rate. */
2018 0 : if (IS_STICKY(entry->counters))
2019 0 : entry->counters.usage *= STICKY_DECREASE_FACTOR;
2020 : else
2021 0 : entry->counters.usage *= USAGE_DECREASE_FACTOR;
2022 : /* In the mean length computation, ignore dropped texts. */
2023 0 : if (entry->query_len >= 0)
2024 : {
2025 0 : tottextlen += entry->query_len + 1;
2026 0 : nvalidtexts++;
2027 : }
2028 : }
2029 :
2030 : /* Sort into increasing order by usage */
2031 0 : qsort(entries, i, sizeof(pgssEntry *), entry_cmp);
2032 :
2033 : /* Record the (approximate) median usage */
2034 0 : if (i > 0)
2035 0 : pgss->cur_median_usage = entries[i / 2]->counters.usage;
2036 : /* Record the mean query length */
2037 0 : if (nvalidtexts > 0)
2038 0 : pgss->mean_query_len = tottextlen / nvalidtexts;
2039 : else
2040 0 : pgss->mean_query_len = ASSUMED_LENGTH_INIT;
2041 :
2042 : /* Now zap an appropriate fraction of lowest-usage entries */
2043 0 : nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
2044 0 : nvictims = Min(nvictims, i);
2045 :
2046 0 : for (i = 0; i < nvictims; i++)
2047 : {
2048 0 : hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL);
2049 : }
2050 :
2051 0 : pfree(entries);
2052 :
2053 : /* Increment the number of times entries are deallocated */
2054 : {
2055 0 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2056 :
2057 0 : SpinLockAcquire(&s->mutex);
2058 0 : s->stats.dealloc += 1;
2059 0 : SpinLockRelease(&s->mutex);
2060 : }
2061 0 : }
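/*
 * To make the eviction policy above concrete, here is the victim-count
 * arithmetic in isolation.  The 5% figure is an assumption about
 * USAGE_DEALLOC_PERCENT (defined earlier in this file), and the entry count
 * is invented; this sketch is standalone and not part of the module.
 */
#include <stdio.h>

#define Max(x, y)	((x) > (y) ? (x) : (y))
#define Min(x, y)	((x) < (y) ? (x) : (y))

int
main(void)
{
	int			usage_dealloc_percent = 5;	/* assumed USAGE_DEALLOC_PERCENT */
	int			nentries = 1000;	/* hypothetical number of live entries */
	int			nvictims;

	/* evict the larger of 10 entries or 5% of them, never more than exist */
	nvictims = Max(10, nentries * usage_dealloc_percent / 100);
	nvictims = Min(nvictims, nentries);

	printf("evicting %d of %d entries\n", nvictims, nentries);	/* 50 of 1000 */
	return 0;
}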
2062 :
2063 : /*
2064 : * Given a query string (not necessarily null-terminated), allocate a new
2065 : * entry in the external query text file and store the string there.
2066 : *
2067 : * If successful, returns true, and stores the new entry's offset in the file
2068 : * into *query_offset. Also, if gc_count isn't NULL, *gc_count is set to the
2069 : * number of garbage collections that have occurred so far.
2070 : *
2071 : * On failure, returns false.
2072 : *
2073 : * At least a shared lock on pgss->lock must be held by the caller, so as
2074 : * to prevent a concurrent garbage collection. Share-lock-holding callers
2075 : * should pass a gc_count pointer to obtain the number of garbage collections,
2076 : * so that they can recheck the count after obtaining exclusive lock to
2077 : * detect whether a garbage collection occurred (and removed this entry).
2078 : */
2079 : static bool
2080 49400 : qtext_store(const char *query, int query_len,
2081 : Size *query_offset, int *gc_count)
2082 : {
2083 : Size off;
2084 : int fd;
2085 :
2086 : /*
2087 : * We use a spinlock to protect extent/n_writers/gc_count, so that
2088 : * multiple processes may execute this function concurrently.
2089 : */
2090 : {
2091 49400 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2092 :
2093 49400 : SpinLockAcquire(&s->mutex);
2094 49400 : off = s->extent;
2095 49400 : s->extent += query_len + 1;
2096 49400 : s->n_writers++;
2097 49400 : if (gc_count)
2098 49400 : *gc_count = s->gc_count;
2099 49400 : SpinLockRelease(&s->mutex);
2100 : }
2101 :
2102 49400 : *query_offset = off;
2103 :
2104 : /*
2105 : * Don't allow the file to grow larger than what qtext_load_file can
2106 : * (theoretically) handle. This has been seen to be reachable on 32-bit
2107 : * platforms.
2108 : */
2109 49400 : if (unlikely(query_len >= MaxAllocHugeSize - off))
2110 : {
2111 0 : errno = EFBIG; /* not quite right, but it'll do */
2112 0 : fd = -1;
2113 0 : goto error;
2114 : }
2115 :
2116 : /* Now write the data into the successfully-reserved part of the file */
2117 49400 : fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
2118 49400 : if (fd < 0)
2119 0 : goto error;
2120 :
2121 49400 : if (pg_pwrite(fd, query, query_len, off) != query_len)
2122 0 : goto error;
2123 49400 : if (pg_pwrite(fd, "\0", 1, off + query_len) != 1)
2124 0 : goto error;
2125 :
2126 49400 : CloseTransientFile(fd);
2127 :
2128 : /* Mark our write complete */
2129 : {
2130 49400 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2131 :
2132 49400 : SpinLockAcquire(&s->mutex);
2133 49400 : s->n_writers--;
2134 49400 : SpinLockRelease(&s->mutex);
2135 : }
2136 :
2137 49400 : return true;
2138 :
2139 0 : error:
2140 0 : ereport(LOG,
2141 : (errcode_for_file_access(),
2142 : errmsg("could not write file \"%s\": %m",
2143 : PGSS_TEXT_FILE)));
2144 :
2145 0 : if (fd >= 0)
2146 0 : CloseTransientFile(fd);
2147 :
2148 : /* Mark our write complete */
2149 : {
2150 0 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2151 :
2152 0 : SpinLockAcquire(&s->mutex);
2153 0 : s->n_writers--;
2154 0 : SpinLockRelease(&s->mutex);
2155 : }
2156 :
2157 0 : return false;
2158 : }
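/*
 * A condensed sketch of the caller-side protocol that the header comment of
 * qtext_store() describes.  The real caller (pgss_store, above this excerpt)
 * is considerably more involved; the function below is invented solely to
 * illustrate the gc_count recheck after lock promotion.
 */
static void
example_store_text(const char *norm_query, int query_len)
{
	Size		query_offset;
	int			gc_count;
	bool		stored;

	/* write the text while holding only shared lock on pgss->lock */
	LWLockAcquire(pgss->lock, LW_SHARED);
	stored = qtext_store(norm_query, query_len, &query_offset, &gc_count);

	/* promote to exclusive lock, e.g. to create a hashtable entry */
	LWLockRelease(pgss->lock);
	LWLockAcquire(pgss->lock, LW_EXCLUSIVE);

	/*
	 * If a garbage collection ran while no lock was held, the text written
	 * above has been discarded; detect that via gc_count and store it again.
	 */
	if (!stored || pgss->gc_count != gc_count)
		stored = qtext_store(norm_query, query_len, &query_offset, &gc_count);

	/* ... create or update the entry using query_offset here ... */

	LWLockRelease(pgss->lock);
}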
2159 :
2160 : /*
2161 : * Read the external query text file into a malloc'd buffer.
2162 : *
2163 : * Returns NULL (without throwing an error) if unable to read, eg
2164 : * file not there or insufficient memory.
2165 : *
2166 : * On success, the buffer size is also returned into *buffer_size.
2167 : *
2168 : * This can be called without any lock on pgss->lock, but in that case
2169 : * the caller is responsible for verifying that the result is sane.
2170 : */
2171 : static char *
2172 92 : qtext_load_file(Size *buffer_size)
2173 : {
2174 : char *buf;
2175 : int fd;
2176 : struct stat stat;
2177 : Size nread;
2178 :
2179 92 : fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDONLY | PG_BINARY);
2180 92 : if (fd < 0)
2181 : {
2182 0 : if (errno != ENOENT)
2183 0 : ereport(LOG,
2184 : (errcode_for_file_access(),
2185 : errmsg("could not read file \"%s\": %m",
2186 : PGSS_TEXT_FILE)));
2187 0 : return NULL;
2188 : }
2189 :
2190 : /* Get file length */
2191 92 : if (fstat(fd, &stat))
2192 : {
2193 0 : ereport(LOG,
2194 : (errcode_for_file_access(),
2195 : errmsg("could not stat file \"%s\": %m",
2196 : PGSS_TEXT_FILE)));
2197 0 : CloseTransientFile(fd);
2198 0 : return NULL;
2199 : }
2200 :
2201 : /* Allocate buffer; beware that off_t might be wider than size_t */
2202 92 : if (stat.st_size <= MaxAllocHugeSize)
2203 92 : buf = (char *) malloc(stat.st_size);
2204 : else
2205 0 : buf = NULL;
2206 92 : if (buf == NULL)
2207 : {
2208 0 : ereport(LOG,
2209 : (errcode(ERRCODE_OUT_OF_MEMORY),
2210 : errmsg("out of memory"),
2211 : errdetail("Could not allocate enough memory to read file \"%s\".",
2212 : PGSS_TEXT_FILE)));
2213 0 : CloseTransientFile(fd);
2214 0 : return NULL;
2215 : }
2216 :
2217 : /*
2218 : * OK, slurp in the file. Windows fails if we try to read more than
2219 : * INT_MAX bytes at once, and other platforms might not like that either,
2220 : * so read a very large file in 1GB segments.
2221 : */
2222 92 : nread = 0;
2223 182 : while (nread < stat.st_size)
2224 : {
2225 90 : int toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
2226 :
2227 : /*
2228 : * If we get a short read and errno doesn't get set, the reason is
2229 : * probably that garbage collection truncated the file since we did
2230 : * the fstat(), so we don't log a complaint --- but we don't return
2231 : * the data, either, since it's most likely corrupt due to concurrent
2232 : * writes from garbage collection.
2233 : */
2234 90 : errno = 0;
2235 90 : if (read(fd, buf + nread, toread) != toread)
2236 : {
2237 0 : if (errno)
2238 0 : ereport(LOG,
2239 : (errcode_for_file_access(),
2240 : errmsg("could not read file \"%s\": %m",
2241 : PGSS_TEXT_FILE)));
2242 0 : free(buf);
2243 0 : CloseTransientFile(fd);
2244 0 : return NULL;
2245 : }
2246 90 : nread += toread;
2247 : }
2248 :
2249 92 : if (CloseTransientFile(fd) != 0)
2250 0 : ereport(LOG,
2251 : (errcode_for_file_access(),
2252 : errmsg("could not close file \"%s\": %m", PGSS_TEXT_FILE)));
2253 :
2254 92 : *buffer_size = nread;
2255 92 : return buf;
2256 : }
2257 :
2258 : /*
2259 : * Locate a query text in the file image previously read by qtext_load_file().
2260 : *
2261 : * We validate the given offset/length, and return NULL if bogus. Otherwise,
2262 : * the result points to a null-terminated string within the buffer.
2263 : */
2264 : static char *
2265 96724 : qtext_fetch(Size query_offset, int query_len,
2266 : char *buffer, Size buffer_size)
2267 : {
2268 : /* File read failed? */
2269 96724 : if (buffer == NULL)
2270 0 : return NULL;
2271 : /* Bogus offset/length? */
2272 96724 : if (query_len < 0 ||
2273 96724 : query_offset + query_len >= buffer_size)
2274 0 : return NULL;
2275 : /* As a further sanity check, make sure there's a trailing null */
2276 96724 : if (buffer[query_offset + query_len] != '\0')
2277 0 : return NULL;
2278 : /* Looks OK */
2279 96724 : return buffer + query_offset;
2280 : }
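/*
 * Putting the two helpers above together: a reader that already holds at
 * least shared lock on pgss->lock could retrieve one entry's text roughly as
 * follows.  This function is invented for illustration; the real consumers
 * are pg_stat_statements_internal() and gc_qtexts() elsewhere in this file.
 */
static char *
example_fetch_text(pgssEntry *entry)
{
	Size		buffer_size;
	char	   *buffer = qtext_load_file(&buffer_size);
	char	   *result = NULL;

	if (buffer != NULL)
	{
		/* qtext_fetch returns a pointer into buffer, or NULL if it looks bogus */
		char	   *qstr = qtext_fetch(entry->query_offset, entry->query_len,
									   buffer, buffer_size);

		if (qstr != NULL)
			result = pstrdup(qstr); /* copy out before freeing the buffer */
		free(buffer);
	}
	return result;
}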
2281 :
2282 : /*
2283 : * Do we need to garbage-collect the external query text file?
2284 : *
2285 : * Caller should hold at least a shared lock on pgss->lock.
2286 : */
2287 : static bool
2288 49400 : need_gc_qtexts(void)
2289 : {
2290 : Size extent;
2291 :
2292 : /* Read shared extent pointer */
2293 : {
2294 49400 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2295 :
2296 49400 : SpinLockAcquire(&s->mutex);
2297 49400 : extent = s->extent;
2298 49400 : SpinLockRelease(&s->mutex);
2299 : }
2300 :
2301 : /*
2302 : * Don't proceed if file does not exceed 512 bytes per possible entry.
2303 : *
2304 : * Here and in the next test, 32-bit machines have overflow hazards if
2305 : * pgss_max and/or mean_query_len are large. Force the multiplications
2306 : * and comparisons to be done in uint64 arithmetic to forestall trouble.
2307 : */
2308 49400 : if ((uint64) extent < (uint64) 512 * pgss_max)
2309 49400 : return false;
2310 :
2311             :      * Don't proceed if the file is less than about 50% bloat.  Nothing can
2312             :      * or should be done if unusually large query texts account for the
2313             :      * file's large size.  We go to the trouble of maintaining the mean query
2314             :      * length in order to prevent garbage collection from thrashing
2315             :      * uselessly.
2316 : * uselessly.
2317 : */
2318 0 : if ((uint64) extent < (uint64) pgss->mean_query_len * pgss_max * 2)
2319 0 : return false;
2320 :
2321 0 : return true;
2322 : }
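/*
 * Concrete numbers for the two thresholds above.  Both inputs are
 * assumptions chosen for illustration: 5000 matches the documented default
 * of pg_stat_statements.max, and 1 kB stands in for mean_query_len.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t	pgss_max = 5000;	/* assumed pg_stat_statements.max */
	uint64_t	mean_query_len = 1024;	/* assumed mean query length, bytes */

	/* below this size the file is never garbage collected */
	uint64_t	floor_bytes = (uint64_t) 512 * pgss_max;

	/* above the floor, gc runs only once the file is ~2x the live text size */
	uint64_t	bloat_bytes = mean_query_len * pgss_max * 2;

	printf("gc floor:     %llu bytes\n", (unsigned long long) floor_bytes);	/* 2560000 */
	printf("gc threshold: %llu bytes\n", (unsigned long long) bloat_bytes);	/* 10240000 */
	return 0;
}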
2323 :
2324 : /*
2325 : * Garbage-collect orphaned query texts in external file.
2326 : *
2327 : * This won't be called often in the typical case, since it's likely that
2328 : * there won't be too much churn, and besides, a similar compaction process
2329 : * occurs when serializing to disk at shutdown or as part of resetting.
2330 : * Despite this, it seems prudent to plan for the edge case where the file
2331 : * becomes unreasonably large, with no other method of compaction likely to
2332 : * occur in the foreseeable future.
2333 : *
2334 : * The caller must hold an exclusive lock on pgss->lock.
2335 : *
2336 : * At the first sign of trouble we unlink the query text file to get a clean
2337 : * slate (although existing statistics are retained), rather than risk
2338 : * thrashing by allowing the same problem case to recur indefinitely.
2339 : */
2340 : static void
2341 0 : gc_qtexts(void)
2342 : {
2343 : char *qbuffer;
2344 : Size qbuffer_size;
2345 0 : FILE *qfile = NULL;
2346 : HASH_SEQ_STATUS hash_seq;
2347 : pgssEntry *entry;
2348 : Size extent;
2349 : int nentries;
2350 :
2351 : /*
2352 : * When called from pgss_store, some other session might have proceeded
2353 : * with garbage collection in the no-lock-held interim of lock strength
2354 : * escalation. Check once more that this is actually necessary.
2355 : */
2356 0 : if (!need_gc_qtexts())
2357 0 : return;
2358 :
2359 : /*
2360 : * Load the old texts file. If we fail (out of memory, for instance),
2361 : * invalidate query texts. Hopefully this is rare. It might seem better
2362 : * to leave things alone on an OOM failure, but the problem is that the
2363 : * file is only going to get bigger; hoping for a future non-OOM result is
2364 : * risky and can easily lead to complete denial of service.
2365 : */
2366 0 : qbuffer = qtext_load_file(&qbuffer_size);
2367 0 : if (qbuffer == NULL)
2368 0 : goto gc_fail;
2369 :
2370 : /*
2371 : * We overwrite the query texts file in place, so as to reduce the risk of
2372 : * an out-of-disk-space failure. Since the file is guaranteed not to get
2373 : * larger, this should always work on traditional filesystems; though we
2374 : * could still lose on copy-on-write filesystems.
2375 : */
2376 0 : qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
2377 0 : if (qfile == NULL)
2378 : {
2379 0 : ereport(LOG,
2380 : (errcode_for_file_access(),
2381 : errmsg("could not write file \"%s\": %m",
2382 : PGSS_TEXT_FILE)));
2383 0 : goto gc_fail;
2384 : }
2385 :
2386 0 : extent = 0;
2387 0 : nentries = 0;
2388 :
2389 0 : hash_seq_init(&hash_seq, pgss_hash);
2390 0 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
2391 : {
2392 0 : int query_len = entry->query_len;
2393 0 : char *qry = qtext_fetch(entry->query_offset,
2394 : query_len,
2395 : qbuffer,
2396 : qbuffer_size);
2397 :
2398 0 : if (qry == NULL)
2399 : {
2400 : /* Trouble ... drop the text */
2401 0 : entry->query_offset = 0;
2402 0 : entry->query_len = -1;
2403 : /* entry will not be counted in mean query length computation */
2404 0 : continue;
2405 : }
2406 :
2407 0 : if (fwrite(qry, 1, query_len + 1, qfile) != query_len + 1)
2408 : {
2409 0 : ereport(LOG,
2410 : (errcode_for_file_access(),
2411 : errmsg("could not write file \"%s\": %m",
2412 : PGSS_TEXT_FILE)));
2413 0 : hash_seq_term(&hash_seq);
2414 0 : goto gc_fail;
2415 : }
2416 :
2417 0 : entry->query_offset = extent;
2418 0 : extent += query_len + 1;
2419 0 : nentries++;
2420 : }
2421 :
2422 : /*
2423 : * Truncate away any now-unused space. If this fails for some odd reason,
2424 : * we log it, but there's no need to fail.
2425 : */
2426 0 : if (ftruncate(fileno(qfile), extent) != 0)
2427 0 : ereport(LOG,
2428 : (errcode_for_file_access(),
2429 : errmsg("could not truncate file \"%s\": %m",
2430 : PGSS_TEXT_FILE)));
2431 :
2432 0 : if (FreeFile(qfile))
2433 : {
2434 0 : ereport(LOG,
2435 : (errcode_for_file_access(),
2436 : errmsg("could not write file \"%s\": %m",
2437 : PGSS_TEXT_FILE)));
2438 0 : qfile = NULL;
2439 0 : goto gc_fail;
2440 : }
2441 :
2442 0 : elog(DEBUG1, "pgss gc of queries file shrunk size from %zu to %zu",
2443 : pgss->extent, extent);
2444 :
2445 : /* Reset the shared extent pointer */
2446 0 : pgss->extent = extent;
2447 :
2448 : /*
2449 : * Also update the mean query length, to be sure that need_gc_qtexts()
2450 : * won't still think we have a problem.
2451 : */
2452 0 : if (nentries > 0)
2453 0 : pgss->mean_query_len = extent / nentries;
2454 : else
2455 0 : pgss->mean_query_len = ASSUMED_LENGTH_INIT;
2456 :
2457 0 : free(qbuffer);
2458 :
2459 : /*
2460 : * OK, count a garbage collection cycle. (Note: even though we have
2461 : * exclusive lock on pgss->lock, we must take pgss->mutex for this, since
2462 : * other processes may examine gc_count while holding only the mutex.
2463 : * Also, we have to advance the count *after* we've rewritten the file,
2464 : * else other processes might not realize they read a stale file.)
2465 : */
2466 0 : record_gc_qtexts();
2467 :
2468 0 : return;
2469 :
2470 0 : gc_fail:
2471 : /* clean up resources */
2472 0 : if (qfile)
2473 0 : FreeFile(qfile);
2474 0 : free(qbuffer);
2475 :
2476 : /*
2477 : * Since the contents of the external file are now uncertain, mark all
2478 : * hashtable entries as having invalid texts.
2479 : */
2480 0 : hash_seq_init(&hash_seq, pgss_hash);
2481 0 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
2482 : {
2483 0 : entry->query_offset = 0;
2484 0 : entry->query_len = -1;
2485 : }
2486 :
2487 : /*
2488 : * Destroy the query text file and create a new, empty one
2489 : */
2490 0 : (void) unlink(PGSS_TEXT_FILE);
2491 0 : qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
2492 0 : if (qfile == NULL)
2493 0 : ereport(LOG,
2494 : (errcode_for_file_access(),
2495 : errmsg("could not recreate file \"%s\": %m",
2496 : PGSS_TEXT_FILE)));
2497 : else
2498 0 : FreeFile(qfile);
2499 :
2500 : /* Reset the shared extent pointer */
2501 0 : pgss->extent = 0;
2502 :
2503 : /* Reset mean_query_len to match the new state */
2504 0 : pgss->mean_query_len = ASSUMED_LENGTH_INIT;
2505 :
2506 : /*
2507 : * Bump the GC count even though we failed.
2508 : *
2509             :      * This is needed so that concurrent readers of the file, holding no lock
2510             :      * on pgss->lock, notice that a new version of the file exists.  Once
2511             :      * readers subsequently observe a change in the GC count with pgss->lock
2512             :      * held, that forces a safe reopen of the file.  Writers also require that
2513             :      * we bump here, of course.  (As required by the locking protocol, readers
2514             :      * and writers don't trust earlier file contents until gc_count is found
2515             :      * unchanged after pgss->lock is acquired in shared or exclusive mode, respectively.)
2516 : */
2517 0 : record_gc_qtexts();
2518 : }
2519 :
2520 : /*
2521 : * Release entries corresponding to parameters passed.
2522 : */
2523 : static void
2524 82 : entry_reset(Oid userid, Oid dbid, uint64 queryid)
2525 : {
2526 : HASH_SEQ_STATUS hash_seq;
2527 : pgssEntry *entry;
2528 : FILE *qfile;
2529 : long num_entries;
2530 82 : long num_remove = 0;
2531 : pgssHashKey key;
2532 :
2533 82 : if (!pgss || !pgss_hash)
2534 0 : ereport(ERROR,
2535 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2536 : errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
2537 :
2538 82 : LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
2539 82 : num_entries = hash_get_num_entries(pgss_hash);
2540 :
2541 82 : if (userid != 0 && dbid != 0 && queryid != UINT64CONST(0))
2542 : {
2543 : /* If all the parameters are available, use the fast path. */
2544 2 : memset(&key, 0, sizeof(pgssHashKey));
2545 2 : key.userid = userid;
2546 2 : key.dbid = dbid;
2547 2 : key.queryid = queryid;
2548 :
2549 : /* Remove the key if it exists, starting with the top-level entry */
2550 2 : key.toplevel = false;
2551 2 : entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_REMOVE, NULL);
2552 2 : if (entry) /* found */
2553 0 : num_remove++;
2554 :
2555 : /* Also remove entries for top level statements */
2556 2 : key.toplevel = true;
2557 :
2558 : /* Remove the key if exists */
2559 2 : entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_REMOVE, NULL);
2560 2 : if (entry) /* found */
2561 2 : num_remove++;
2562 : }
2563 80 : else if (userid != 0 || dbid != 0 || queryid != UINT64CONST(0))
2564 : {
2565 : /* Remove entries corresponding to valid parameters. */
2566 6 : hash_seq_init(&hash_seq, pgss_hash);
2567 78 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
2568 : {
2569 72 : if ((!userid || entry->key.userid == userid) &&
2570 52 : (!dbid || entry->key.dbid == dbid) &&
2571 48 : (!queryid || entry->key.queryid == queryid))
2572 : {
2573 8 : hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
2574 8 : num_remove++;
2575 : }
2576 : }
2577 : }
2578 : else
2579 : {
2580 : /* Remove all entries. */
2581 74 : hash_seq_init(&hash_seq, pgss_hash);
2582 672 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
2583 : {
2584 598 : hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
2585 598 : num_remove++;
2586 : }
2587 : }
2588 :
2589 : /* All entries are removed? */
2590 82 : if (num_entries != num_remove)
2591 8 : goto release_lock;
2592 :
2593 : /*
2594 : * Reset global statistics for pg_stat_statements since all entries are
2595 : * removed.
2596 : */
2597 : {
2598 74 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2599 74 : TimestampTz stats_reset = GetCurrentTimestamp();
2600 :
2601 74 : SpinLockAcquire(&s->mutex);
2602 74 : s->stats.dealloc = 0;
2603 74 : s->stats.stats_reset = stats_reset;
2604 74 : SpinLockRelease(&s->mutex);
2605 : }
2606 :
2607 : /*
2608             :      * Write out a new, empty query text file, creating it from scratch if
2609             :      * the file was missing.
2610 : */
2611 74 : qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
2612 74 : if (qfile == NULL)
2613 : {
2614 0 : ereport(LOG,
2615 : (errcode_for_file_access(),
2616 : errmsg("could not create file \"%s\": %m",
2617 : PGSS_TEXT_FILE)));
2618 0 : goto done;
2619 : }
2620 :
2621 : /* If ftruncate fails, log it, but it's not a fatal problem */
2622 74 : if (ftruncate(fileno(qfile), 0) != 0)
2623 0 : ereport(LOG,
2624 : (errcode_for_file_access(),
2625 : errmsg("could not truncate file \"%s\": %m",
2626 : PGSS_TEXT_FILE)));
2627 :
2628 74 : FreeFile(qfile);
2629 :
2630 74 : done:
2631 74 : pgss->extent = 0;
2632 : /* This counts as a query text garbage collection for our purposes */
2633 74 : record_gc_qtexts();
2634 :
2635 82 : release_lock:
2636 82 : LWLockRelease(pgss->lock);
2637 82 : }
2638 :
2639 : /*
2640 : * Generate a normalized version of the query string that will be used to
2641 : * represent all similar queries.
2642 : *
2643 : * Note that the normalized representation may well vary depending on
2644 : * just which "equivalent" query is used to create the hashtable entry.
2645 : * We assume this is OK.
2646 : *
2647 : * If query_loc > 0, then "query" has been advanced by that much compared to
2648 : * the original string start, so we need to translate the provided locations
2649 : * to compensate. (This lets us avoid re-scanning statements before the one
2650 : * of interest, so it's worth doing.)
2651 : *
2652 : * *query_len_p contains the input string length, and is updated with
2653 : * the result string length on exit. The resulting string might be longer
2654 : * or shorter depending on what happens with replacement of constants.
2655 : *
2656 : * Returns a palloc'd string.
2657 : */
2658 : static char *
2659 17604 : generate_normalized_query(JumbleState *jstate, const char *query,
2660 : int query_loc, int *query_len_p)
2661 : {
2662 : char *norm_query;
2663 17604 : int query_len = *query_len_p;
2664 : int i,
2665 : norm_query_buflen, /* Space allowed for norm_query */
2666 : len_to_wrt, /* Length (in bytes) to write */
2667 17604 : quer_loc = 0, /* Source query byte location */
2668 17604 : n_quer_loc = 0, /* Normalized query byte location */
2669 17604 : last_off = 0, /* Offset from start for previous tok */
2670 17604 : last_tok_len = 0; /* Length (in bytes) of that tok */
2671 :
2672 : /*
2673 : * Get constants' lengths (core system only gives us locations). Note
2674 : * this also ensures the items are sorted by location.
2675 : */
2676 17604 : fill_in_constant_lengths(jstate, query, query_loc);
2677 :
2678 : /*
2679 : * Allow for $n symbols to be longer than the constants they replace.
2680 : * Constants must take at least one byte in text form, while a $n symbol
2681 : * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We
2682 : * could refine that limit based on the max value of n for the current
2683 : * query, but it hardly seems worth any extra effort to do so.
2684 : */
2685 17604 : norm_query_buflen = query_len + jstate->clocations_count * 10;
2686 :
2687 : /* Allocate result buffer */
2688 17604 : norm_query = palloc(norm_query_buflen + 1);
2689 :
2690 73258 : for (i = 0; i < jstate->clocations_count; i++)
2691 : {
2692 : int off, /* Offset from start for cur tok */
2693 : tok_len; /* Length (in bytes) of that tok */
2694 :
2695 55654 : off = jstate->clocations[i].location;
2696 : /* Adjust recorded location if we're dealing with partial string */
2697 55654 : off -= query_loc;
2698 :
2699 55654 : tok_len = jstate->clocations[i].length;
2700 :
2701 55654 : if (tok_len < 0)
2702 320 : continue; /* ignore any duplicates */
2703 :
2704 : /* Copy next chunk (what precedes the next constant) */
2705 55334 : len_to_wrt = off - last_off;
2706 55334 : len_to_wrt -= last_tok_len;
2707 :
2708 : Assert(len_to_wrt >= 0);
2709 55334 : memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
2710 55334 : n_quer_loc += len_to_wrt;
2711 :
2712 : /* And insert a param symbol in place of the constant token */
2713 110668 : n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d",
2714 55334 : i + 1 + jstate->highest_extern_param_id);
2715 :
2716 55334 : quer_loc = off + tok_len;
2717 55334 : last_off = off;
2718 55334 : last_tok_len = tok_len;
2719 : }
2720 :
2721 : /*
2722 : * We've copied up until the last ignorable constant. Copy over the
2723 : * remaining bytes of the original query string.
2724 : */
2725 17604 : len_to_wrt = query_len - quer_loc;
2726 :
2727 : Assert(len_to_wrt >= 0);
2728 17604 : memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
2729 17604 : n_quer_loc += len_to_wrt;
2730 :
2731 : Assert(n_quer_loc <= norm_query_buflen);
2732 17604 : norm_query[n_quer_loc] = '\0';
2733 :
2734 17604 : *query_len_p = n_quer_loc;
2735 17604 : return norm_query;
2736 : }
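/*
 * A concrete (and entirely hypothetical) input/output pair for the
 * transformation above, assuming highest_extern_param_id is 0 so parameter
 * numbering starts at $1: the two constants are replaced, everything else is
 * copied through verbatim.
 */
static const char *example_raw_query =
	"SELECT * FROM foo WHERE bar = 1 AND baz = 'x'";
static const char *example_normalized_query =
	"SELECT * FROM foo WHERE bar = $1 AND baz = $2";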
2737 :
2738 : /*
2739 : * Given a valid SQL string and an array of constant-location records,
2740 : * fill in the textual lengths of those constants.
2741 : *
2742 : * The constants may use any allowed constant syntax, such as float literals,
2743 : * bit-strings, single-quoted strings and dollar-quoted strings. This is
2744 : * accomplished by using the public API for the core scanner.
2745 : *
2746 : * It is the caller's job to ensure that the string is a valid SQL statement
2747 : * with constants at the indicated locations. Since in practice the string
2748 : * has already been parsed, and the locations that the caller provides will
2749 : * have originated from within the authoritative parser, this should not be
2750 : * a problem.
2751 : *
2752 : * Duplicate constant pointers are possible, and will have their lengths
2753 : * marked as '-1', so that they are later ignored. (Actually, we assume the
2754 : * lengths were initialized as -1 to start with, and don't change them here.)
2755 : *
2756 : * If query_loc > 0, then "query" has been advanced by that much compared to
2757 : * the original string start, so we need to translate the provided locations
2758 : * to compensate. (This lets us avoid re-scanning statements before the one
2759 : * of interest, so it's worth doing.)
2760 : *
2761 : * N.B. There is an assumption that a '-' character at a Const location begins
2762 : * a negative numeric constant. This precludes there ever being another
2763 : * reason for a constant to start with a '-'.
2764 : */
2765 : static void
2766 17604 : fill_in_constant_lengths(JumbleState *jstate, const char *query,
2767 : int query_loc)
2768 : {
2769 : LocationLen *locs;
2770 : core_yyscan_t yyscanner;
2771 : core_yy_extra_type yyextra;
2772 : core_YYSTYPE yylval;
2773 : YYLTYPE yylloc;
2774 17604 : int last_loc = -1;
2775 : int i;
2776 :
2777 : /*
2778 : * Sort the records by location so that we can process them in order while
2779 : * scanning the query text.
2780 : */
2781 17604 : if (jstate->clocations_count > 1)
2782 11510 : qsort(jstate->clocations, jstate->clocations_count,
2783 : sizeof(LocationLen), comp_location);
2784 17604 : locs = jstate->clocations;
2785 :
2786 : /* initialize the flex scanner --- should match raw_parser() */
2787 17604 : yyscanner = scanner_init(query,
2788 : &yyextra,
2789 : &ScanKeywords,
2790 : ScanKeywordTokens);
2791 :
2792 : /* we don't want to re-emit any escape string warnings */
2793 17604 : yyextra.escape_string_warning = false;
2794 :
2795 : /* Search for each constant, in sequence */
2796 73258 : for (i = 0; i < jstate->clocations_count; i++)
2797 : {
2798 55654 : int loc = locs[i].location;
2799 : int tok;
2800 :
2801 : /* Adjust recorded location if we're dealing with partial string */
2802 55654 : loc -= query_loc;
2803 :
2804 : Assert(loc >= 0);
2805 :
2806 55654 : if (loc <= last_loc)
2807 320 : continue; /* Duplicate constant, ignore */
2808 :
2809 : /* Lex tokens until we find the desired constant */
2810 : for (;;)
2811 : {
2812 414506 : tok = core_yylex(&yylval, &yylloc, yyscanner);
2813 :
2814 : /* We should not hit end-of-string, but if we do, behave sanely */
2815 414506 : if (tok == 0)
2816 0 : break; /* out of inner for-loop */
2817 :
2818 : /*
2819 : * We should find the token position exactly, but if we somehow
2820 : * run past it, work with that.
2821 : */
2822 414506 : if (yylloc >= loc)
2823 : {
2824 55334 : if (query[loc] == '-')
2825 : {
2826 : /*
2827 : * It's a negative value - this is the one and only case
2828 : * where we replace more than a single token.
2829 : *
2830 : * Do not compensate for the core system's special-case
2831 : * adjustment of location to that of the leading '-'
2832 : * operator in the event of a negative constant. It is
2833 : * also useful for our purposes to start from the minus
2834 : * symbol. In this way, queries like "select * from foo
2835 : * where bar = 1" and "select * from foo where bar = -2"
2836 : * will have identical normalized query strings.
2837 : */
2838 718 : tok = core_yylex(&yylval, &yylloc, yyscanner);
2839 718 : if (tok == 0)
2840 0 : break; /* out of inner for-loop */
2841 : }
2842 :
2843 : /*
2844 : * We now rely on the assumption that flex has placed a zero
2845 : * byte after the text of the current token in scanbuf.
2846 : */
2847 55334 : locs[i].length = strlen(yyextra.scanbuf + loc);
2848 55334 : break; /* out of inner for-loop */
2849 : }
2850 : }
2851 :
2852 : /* If we hit end-of-string, give up, leaving remaining lengths -1 */
2853 55334 : if (tok == 0)
2854 0 : break;
2855 :
2856 55334 : last_loc = loc;
2857 : }
2858 :
2859 17604 : scanner_finish(yyscanner);
2860 17604 : }
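/*
 * Hypothetical fragment illustrating the '-' special case handled above: the
 * recorded Const location points at the '-' of "-2", and the length computed
 * here is strlen("-2") == 2, so generate_normalized_query() later replaces
 * the whole token pair with a single parameter symbol, e.g. "WHERE bar = $1".
 */
static const char *example_negative_fragment = "WHERE bar = -2";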
2861 :
2862 : /*
2863 : * comp_location: comparator for qsorting LocationLen structs by location
2864 : */
2865 : static int
2866 65986 : comp_location(const void *a, const void *b)
2867 : {
2868 65986 : int l = ((const LocationLen *) a)->location;
2869 65986 : int r = ((const LocationLen *) b)->location;
2870 :
2871 65986 : if (l < r)
2872 44942 : return -1;
2873 21044 : else if (l > r)
2874 20710 : return +1;
2875 : else
2876 334 : return 0;
2877 : }
|