Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * scansup.c 4 : * scanner support routines used by the core lexer 5 : * 6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group 7 : * Portions Copyright (c) 1994, Regents of the University of California 8 : * 9 : * 10 : * IDENTIFICATION 11 : * src/backend/parser/scansup.c 12 : * 13 : *------------------------------------------------------------------------- 14 : */ 15 : #include "postgres.h" 16 : 17 : #include <ctype.h> 18 : 19 : #include "mb/pg_wchar.h" 20 : #include "parser/scansup.h" 21 : 22 : 23 : /* 24 : * downcase_truncate_identifier() --- do appropriate downcasing and 25 : * truncation of an unquoted identifier. Optionally warn of truncation. 26 : * 27 : * Returns a palloc'd string containing the adjusted identifier. 28 : * 29 : * Note: in some usages the passed string is not null-terminated. 30 : * 31 : * Note: the API of this function is designed to allow for downcasing 32 : * transformations that increase the string length, but we don't yet 33 : * support that. If you want to implement it, you'll need to fix 34 : * SplitIdentifierString() in utils/adt/varlena.c. 35 : */ 36 : char * 37 5940500 : downcase_truncate_identifier(const char *ident, int len, bool warn) 38 : { 39 5940500 : return downcase_identifier(ident, len, warn, true); 40 : } 41 : 42 : /* 43 : * a workhorse for downcase_truncate_identifier 44 : */ 45 : char * 46 5940602 : downcase_identifier(const char *ident, int len, bool warn, bool truncate) 47 : { 48 : char *result; 49 : int i; 50 : bool enc_is_single_byte; 51 : 52 5940602 : result = palloc(len + 1); 53 5940602 : enc_is_single_byte = pg_database_encoding_max_length() == 1; 54 : 55 : /* 56 : * SQL99 specifies Unicode-aware case normalization, which we don't yet 57 : * have the infrastructure for. Instead we use tolower() to provide a 58 : * locale-aware translation. However, there are some locales where this 59 : * is not right either (eg, Turkish may do strange things with 'i' and 60 : * 'I'). Our current compromise is to use tolower() for characters with 61 : * the high bit set, as long as they aren't part of a multi-byte 62 : * character, and use an ASCII-only downcasing for 7-bit characters. 63 : */ 64 47726280 : for (i = 0; i < len; i++) 65 : { 66 41785678 : unsigned char ch = (unsigned char) ident[i]; 67 : 68 41785678 : if (ch >= 'A' && ch <= 'Z') 69 1014438 : ch += 'a' - 'A'; 70 40771240 : else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch)) 71 0 : ch = tolower(ch); 72 41785678 : result[i] = (char) ch; 73 : } 74 5940602 : result[i] = '\0'; 75 : 76 5940602 : if (i >= NAMEDATALEN && truncate) 77 12 : truncate_identifier(result, i, warn); 78 : 79 5940602 : return result; 80 : } 81 : 82 : 83 : /* 84 : * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes. 85 : * 86 : * The given string is modified in-place, if necessary. A warning is 87 : * issued if requested. 88 : * 89 : * We require the caller to pass in the string length since this saves a 90 : * strlen() call in some common usages. 91 : */ 92 : void 93 540994 : truncate_identifier(char *ident, int len, bool warn) 94 : { 95 540994 : if (len >= NAMEDATALEN) 96 : { 97 14 : len = pg_mbcliplen(ident, len, NAMEDATALEN - 1); 98 14 : if (warn) 99 14 : ereport(NOTICE, 100 : (errcode(ERRCODE_NAME_TOO_LONG), 101 : errmsg("identifier \"%s\" will be truncated to \"%.*s\"", 102 : ident, len, ident))); 103 14 : ident[len] = '\0'; 104 : } 105 540994 : } 106 : 107 : /* 108 : * scanner_isspace() --- return true if flex scanner considers char whitespace 109 : * 110 : * This should be used instead of the potentially locale-dependent isspace() 111 : * function when it's important to match the lexer's behavior. 112 : * 113 : * In principle we might need similar functions for isalnum etc, but for the 114 : * moment only isspace seems needed. 115 : */ 116 : bool 117 31467830 : scanner_isspace(char ch) 118 : { 119 : /* This must match scan.l's list of {space} characters */ 120 31467830 : if (ch == ' ' || 121 31238830 : ch == '\t' || 122 31238654 : ch == '\n' || 123 31238636 : ch == '\r' || 124 31238630 : ch == '\v' || 125 : ch == '\f') 126 229206 : return true; 127 31238624 : return false; 128 : }