Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * scansup.c 4 : * scanner support routines used by the core lexer 5 : * 6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group 7 : * Portions Copyright (c) 1994, Regents of the University of California 8 : * 9 : * 10 : * IDENTIFICATION 11 : * src/backend/parser/scansup.c 12 : * 13 : *------------------------------------------------------------------------- 14 : */ 15 : #include "postgres.h" 16 : 17 : #include <ctype.h> 18 : 19 : #include "mb/pg_wchar.h" 20 : #include "parser/scansup.h" 21 : 22 : 23 : /* 24 : * downcase_truncate_identifier() --- do appropriate downcasing and 25 : * truncation of an unquoted identifier. Optionally warn of truncation. 26 : * 27 : * Returns a palloc'd string containing the adjusted identifier. 28 : * 29 : * Note: in some usages the passed string is not null-terminated. 30 : * 31 : * Note: the API of this function is designed to allow for downcasing 32 : * transformations that increase the string length, but we don't yet 33 : * support that. If you want to implement it, you'll need to fix 34 : * SplitIdentifierString() in utils/adt/varlena.c. 35 : */ 36 : char * 37 5997104 : downcase_truncate_identifier(const char *ident, int len, bool warn) 38 : { 39 5997104 : return downcase_identifier(ident, len, warn, true); 40 : } 41 : 42 : /* 43 : * a workhorse for downcase_truncate_identifier 44 : */ 45 : char * 46 5997206 : downcase_identifier(const char *ident, int len, bool warn, bool truncate) 47 : { 48 : char *result; 49 : int i; 50 : bool enc_is_single_byte; 51 : 52 5997206 : result = palloc(len + 1); 53 5997206 : enc_is_single_byte = pg_database_encoding_max_length() == 1; 54 : 55 : /* 56 : * SQL99 specifies Unicode-aware case normalization, which we don't yet 57 : * have the infrastructure for. Instead we use tolower() to provide a 58 : * locale-aware translation. However, there are some locales where this 59 : * is not right either (eg, Turkish may do strange things with 'i' and 60 : * 'I'). Our current compromise is to use tolower() for characters with 61 : * the high bit set, as long as they aren't part of a multi-byte 62 : * character, and use an ASCII-only downcasing for 7-bit characters. 63 : */ 64 48164744 : for (i = 0; i < len; i++) 65 : { 66 42167538 : unsigned char ch = (unsigned char) ident[i]; 67 : 68 42167538 : if (ch >= 'A' && ch <= 'Z') 69 1023272 : ch += 'a' - 'A'; 70 41144266 : else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch)) 71 0 : ch = tolower(ch); 72 42167538 : result[i] = (char) ch; 73 : } 74 5997206 : result[i] = '\0'; 75 : 76 5997206 : if (i >= NAMEDATALEN && truncate) 77 12 : truncate_identifier(result, i, warn); 78 : 79 5997206 : return result; 80 : } 81 : 82 : 83 : /* 84 : * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes. 85 : * 86 : * The given string is modified in-place, if necessary. A warning is 87 : * issued if requested. 88 : * 89 : * We require the caller to pass in the string length since this saves a 90 : * strlen() call in some common usages. 91 : */ 92 : void 93 563914 : truncate_identifier(char *ident, int len, bool warn) 94 : { 95 563914 : if (len >= NAMEDATALEN) 96 : { 97 14 : len = pg_mbcliplen(ident, len, NAMEDATALEN - 1); 98 14 : if (warn) 99 14 : ereport(NOTICE, 100 : (errcode(ERRCODE_NAME_TOO_LONG), 101 : errmsg("identifier \"%s\" will be truncated to \"%.*s\"", 102 : ident, len, ident))); 103 14 : ident[len] = '\0'; 104 : } 105 563914 : } 106 : 107 : /* 108 : * scanner_isspace() --- return true if flex scanner considers char whitespace 109 : * 110 : * This should be used instead of the potentially locale-dependent isspace() 111 : * function when it's important to match the lexer's behavior. 112 : * 113 : * In principle we might need similar functions for isalnum etc, but for the 114 : * moment only isspace seems needed. 115 : */ 116 : bool 117 31901522 : scanner_isspace(char ch) 118 : { 119 : /* This must match scan.l's list of {space} characters */ 120 31901522 : if (ch == ' ' || 121 31656646 : ch == '\t' || 122 31656470 : ch == '\n' || 123 31656452 : ch == '\r' || 124 31656446 : ch == '\v' || 125 : ch == '\f') 126 245082 : return true; 127 31656440 : return false; 128 : }