Line data Source code
1 : /* inline.h
2 : *
3 : * Copyright (C) 2012 by Larry Wall and others
4 : *
5 : * You may distribute under the terms of either the GNU General Public
6 : * License or the Artistic License, as specified in the README file.
7 : *
8 : * This file contains tables and code adapted from
9 : * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
10 : * copyright notice:
11 :
12 : Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13 :
14 : Permission is hereby granted, free of charge, to any person obtaining a copy of
15 : this software and associated documentation files (the "Software"), to deal in
16 : the Software without restriction, including without limitation the rights to
17 : use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 : of the Software, and to permit persons to whom the Software is furnished to do
19 : so, subject to the following conditions:
20 :
21 : The above copyright notice and this permission notice shall be included in all
22 : copies or substantial portions of the Software.
23 :
24 : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 : SOFTWARE.
31 :
32 : *
33 : * This file is a home for static inline functions that cannot go in other
34 : * header files, because they depend on proto.h (included after most other
35 : * headers) or struct definitions.
36 : *
37 : * Note also perlstatic.h for functions that can't or shouldn't be inlined, but
38 : * whose details should be exposed to the compiler, for such things as tail
39 : * call optimization.
40 : *
41 : * Each section names the header file that the functions "belong" to.
42 : */
43 :
44 : /* ------------------------------- av.h ------------------------------- */
45 :
46 : /*
47 : =for apidoc_section $AV
48 : =for apidoc av_count
49 : Returns the number of elements in the array C<av>. This is the true length of
50 : the array, including any undefined elements. It is always the same as
51 : S<C<av_top_index(av) + 1>>.
52 :
53 : =cut
54 : */
55 : PERL_STATIC_INLINE Size_t
56 : Perl_av_count(pTHX_ AV *av)
57 : {
58 : PERL_ARGS_ASSERT_AV_COUNT;
59 : assert(SvTYPE(av) == SVt_PVAV);
60 :
61 : return AvFILL(av) + 1;
62 : }
63 :
64 : /* ------------------------------- av.c ------------------------------- */
65 :
66 : /*
67 : =for apidoc av_store_simple
68 :
69 : This is a cut-down version of av_store that assumes that the array is
70 : very straightforward - no magic, not readonly, and AvREAL - and that
71 : C<key> is not negative. This function MUST NOT be used in situations
72 : where any of those assumptions may not hold.
73 :
74 : Stores an SV in an array. The array index is specified as C<key>. It
75 : can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
76 :
77 : Note that the caller is responsible for suitably incrementing the reference
78 : count of C<val> before the call.
79 :
80 : Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
81 :
82 : =cut
83 : */
84 :
85 : PERL_STATIC_INLINE SV**
86 : Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
87 : {
88 : SV** ary;
89 :
90 : PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
91 : assert(SvTYPE(av) == SVt_PVAV);
92 : assert(!SvMAGICAL(av));
93 : assert(!SvREADONLY(av));
94 : assert(AvREAL(av));
95 : assert(key > -1);
96 :
97 : ary = AvARRAY(av);
98 :
99 : if (AvFILLp(av) < key) {
100 : if (key > AvMAX(av)) {
101 : av_extend(av,key);
102 : ary = AvARRAY(av);
103 : }
104 : AvFILLp(av) = key;
105 : } else
106 : SvREFCNT_dec(ary[key]);
107 :
108 : ary[key] = val;
109 : return &ary[key];
110 : }
111 :
112 : /*
113 : =for apidoc av_fetch_simple
114 :
115 : This is a cut-down version of av_fetch that assumes that the array is
116 : very straightforward - no magic, not readonly, and AvREAL - and that
117 : C<key> is not negative. This function MUST NOT be used in situations
118 : where any of those assumptions may not hold.
119 :
120 : Returns the SV at the specified index in the array. The C<key> is the
121 : index. If lval is true, you are guaranteed to get a real SV back (in case
122 : it wasn't real before), which you can then modify. Check that the return
123 : value is non-null before dereferencing it to a C<SV*>.
124 :
125 : The rough perl equivalent is C<$myarray[$key]>.
126 :
127 : =cut
128 : */
129 :
130 : PERL_STATIC_INLINE SV**
131 : Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
132 : {
133 : PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
134 : assert(SvTYPE(av) == SVt_PVAV);
135 : assert(!SvMAGICAL(av));
136 : assert(!SvREADONLY(av));
137 : assert(AvREAL(av));
138 : assert(key > -1);
139 :
140 : if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
141 : return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
142 : } else {
143 : return &AvARRAY(av)[key];
144 : }
145 : }
146 :
147 : /*
148 : =for apidoc av_push_simple
149 :
150 : This is a cut-down version of av_push that assumes that the array is very
151 : straightforward - no magic, not readonly, and AvREAL - and that C<key> is
152 : not less than -1. This function MUST NOT be used in situations where any
153 : of those assumptions may not hold.
154 :
155 : Pushes an SV (transferring control of one reference count) onto the end of the
156 : array. The array will grow automatically to accommodate the addition.
157 :
158 : Perl equivalent: C<push @myarray, $val;>.
159 :
160 : =cut
161 : */
162 :
163 : PERL_STATIC_INLINE void
164 : Perl_av_push_simple(pTHX_ AV *av, SV *val)
165 : {
166 : PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
167 : assert(SvTYPE(av) == SVt_PVAV);
168 : assert(!SvMAGICAL(av));
169 : assert(!SvREADONLY(av));
170 : assert(AvREAL(av));
171 : assert(AvFILLp(av) > -2);
172 :
173 : (void)av_store_simple(av,AvFILLp(av)+1,val);
174 : }
175 :
176 : /*
177 : =for apidoc av_new_alloc
178 :
179 : This implements L<perlapi/C<newAV_alloc_x>>
180 : and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this
181 : functionality.
182 :
183 : Creates a new AV and allocates its SV* array.
184 :
185 : This is similar to, but more efficient than doing:
186 :
187 : AV *av = newAV();
188 : av_extend(av, key);
189 :
190 : The size parameter is used to pre-allocate a SV* array large enough to
191 : hold at least elements C<0..(size-1)>. C<size> must be at least 1.
192 :
193 : The C<zeroflag> parameter controls whether or not the array is NULL
194 : initialized.
195 :
196 : =cut
197 : */
198 :
199 : PERL_STATIC_INLINE AV *
200 : Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag)
201 : {
202 : AV * const av = newAV();
203 : SV** ary;
204 : PERL_ARGS_ASSERT_AV_NEW_ALLOC;
205 : assert(size > 0);
206 :
207 : Newx(ary, size, SV*); /* Newx performs the memwrap check */
208 : AvALLOC(av) = ary;
209 : AvARRAY(av) = ary;
210 : AvMAX(av) = size - 1;
211 :
212 : if (zeroflag)
213 : Zero(ary, size, SV*);
214 :
215 : return av;
216 : }
217 :
218 :
219 : /* remove (AvARRAY(av) - AvALLOC(av)) offset from empty array */
220 :
221 : PERL_STATIC_INLINE void
222 : Perl_av_remove_offset(pTHX_ AV *av)
223 : {
224 : PERL_ARGS_ASSERT_AV_REMOVE_OFFSET;
225 : assert(AvFILLp(av) == -1);
226 : SSize_t i = AvARRAY(av) - AvALLOC(av);
227 : if (i) {
228 : AvARRAY(av) = AvALLOC(av);
229 : AvMAX(av) += i;
230 : #ifdef PERL_RC_STACK
231 : Zero(AvALLOC(av), i, SV*);
232 : #endif
233 : }
234 : }
235 :
236 :
237 : /* ------------------------------- cv.h ------------------------------- */
238 :
239 : /*
240 : =for apidoc_section $CV
241 : =for apidoc CvGV
242 : Returns the GV associated with the CV C<sv>, reifying it if necessary.
243 :
244 : =cut
245 : */
246 : PERL_STATIC_INLINE GV *
247 : Perl_CvGV(pTHX_ CV *sv)
248 : {
249 : PERL_ARGS_ASSERT_CVGV;
250 :
251 : return CvNAMED(sv)
252 : ? Perl_cvgv_from_hek(aTHX_ sv)
253 : : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
254 : }
255 :
256 : /*
257 : =for apidoc CvDEPTH
258 : Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a
259 : recursive call.
260 :
261 : =cut
262 : */
263 : PERL_STATIC_INLINE I32 *
264 : Perl_CvDEPTH(const CV * const sv)
265 : {
266 : PERL_ARGS_ASSERT_CVDEPTH;
267 : assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
268 :
269 : return &((XPVCV*)SvANY(sv))->xcv_depth;
270 : }
271 :
272 : /*
273 : CvPROTO returns the prototype as stored, which is not necessarily what
274 : the interpreter should be using. Specifically, the interpreter assumes
275 : that spaces have been stripped, which has been the case if the prototype
276 : was added by toke.c, but is generally not the case if it was added elsewhere.
277 : Since we can't enforce the spacelessness at assignment time, this routine
278 : provides a temporary copy at parse time with spaces removed.
279 : I<orig> is the start of the original buffer, I<len> is the length of the
280 : prototype and will be updated when this returns.
281 : */
282 :
283 : #ifdef PERL_CORE
284 : PERL_STATIC_INLINE char *
285 : S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
286 : {
287 : SV * tmpsv;
288 : char * tmps;
289 : tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
290 : tmps = SvPVX(tmpsv);
291 : while ((*len)--) {
292 : if (!isSPACE(*orig))
293 : *tmps++ = *orig;
294 : orig++;
295 : }
296 : *tmps = '\0';
297 : *len = tmps - SvPVX(tmpsv);
298 : return SvPVX(tmpsv);
299 : }
300 : #endif
301 :
302 : /* ------------------------------- iperlsys.h ------------------------------- */
303 : #if ! defined(PERL_IMPLICIT_SYS) && defined(USE_ITHREADS)
304 :
305 : /* Otherwise this function is implemented as macros in iperlsys.h */
306 :
307 : PERL_STATIC_INLINE bool
308 : S_PerlEnv_putenv(pTHX_ char * str)
309 : {
310 : PERL_ARGS_ASSERT_PERLENV_PUTENV;
311 :
312 : ENV_LOCK;
313 : bool retval = putenv(str);
314 : ENV_UNLOCK;
315 :
316 : return retval;
317 : }
318 :
319 : #endif
320 :
321 : /* ------------------------------- mg.h ------------------------------- */
322 :
323 : #if defined(PERL_CORE) || defined(PERL_EXT)
324 : /* assumes get-magic and stringification have already occurred */
325 : PERL_STATIC_INLINE STRLEN
326 : S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
327 : {
328 : assert(mg->mg_type == PERL_MAGIC_regex_global);
329 : assert(mg->mg_len != -1);
330 : if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
331 : return (STRLEN)mg->mg_len;
332 : else {
333 : const STRLEN pos = (STRLEN)mg->mg_len;
334 : /* Without this check, we may read past the end of the buffer: */
335 : if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
336 : return sv_or_pv_pos_u2b(sv, s, pos, NULL);
337 : }
338 : }
339 : #endif
340 :
341 : /* ------------------------------- pad.h ------------------------------ */
342 :
343 : #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
344 : PERL_STATIC_INLINE bool
345 : S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
346 : {
347 : PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
348 :
349 : /* is seq within the range _LOW to _HIGH ?
350 : * This is complicated by the fact that PL_cop_seqmax
351 : * may have wrapped around at some point */
352 : if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
353 : return FALSE; /* not yet introduced */
354 :
355 : if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
356 : /* in compiling scope */
357 : if (
358 : (seq > COP_SEQ_RANGE_LOW(pn))
359 : ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
360 : : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
361 : )
362 : return TRUE;
363 : }
364 : else if (
365 : (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
366 : ?
367 : ( seq > COP_SEQ_RANGE_LOW(pn)
368 : || seq <= COP_SEQ_RANGE_HIGH(pn))
369 :
370 : : ( seq > COP_SEQ_RANGE_LOW(pn)
371 : && seq <= COP_SEQ_RANGE_HIGH(pn))
372 : )
373 : return TRUE;
374 : return FALSE;
375 : }
376 : #endif
377 :
378 : /* ------------------------------- pp.h ------------------------------- */
379 :
380 : PERL_STATIC_INLINE Stack_off_t
381 : Perl_TOPMARK(pTHX)
382 : {
383 : DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
384 : "MARK top %p %" IVdf "\n",
385 : PL_markstack_ptr,
386 : (IV)*PL_markstack_ptr)));
387 : return *PL_markstack_ptr;
388 : }
389 :
390 : PERL_STATIC_INLINE Stack_off_t
391 706 : Perl_POPMARK(pTHX)
392 : {
393 : DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
394 : "MARK pop %p %" IVdf "\n",
395 : (PL_markstack_ptr-1),
396 : (IV)*(PL_markstack_ptr-1))));
397 : assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
398 706 : return *PL_markstack_ptr--;
399 : }
400 :
401 : /*
402 : =for apidoc_section $rpp
403 :
404 : =for apidoc rpp_extend
405 : Ensures that there is space on the stack to push C<n> items, extending it
406 : if necessary.
407 :
408 : =cut
409 : */
410 :
411 : PERL_STATIC_INLINE void
412 : Perl_rpp_extend(pTHX_ SSize_t n)
413 : {
414 : PERL_ARGS_ASSERT_RPP_EXTEND;
415 :
416 : EXTEND_HWM_SET(PL_stack_sp, n);
417 : #ifndef STRESS_REALLOC
418 : if (UNLIKELY(_EXTEND_NEEDS_GROW(PL_stack_sp, n)))
419 : #endif
420 : {
421 : (void)stack_grow(PL_stack_sp, PL_stack_sp, n);
422 : }
423 : }
424 :
425 :
426 : /*
427 : =for apidoc rpp_popfree_to
428 :
429 : Pop and free all items on the argument stack above C<sp>. On return,
430 : C<PL_stack_sp> will be equal to C<sp>.
431 :
432 : =cut
433 : */
434 :
435 : PERL_STATIC_INLINE void
436 : Perl_rpp_popfree_to(pTHX_ SV **sp)
437 : {
438 : PERL_ARGS_ASSERT_RPP_POPFREE_TO;
439 :
440 : assert(sp <= PL_stack_sp);
441 : #ifdef PERL_RC_STACK
442 : assert(rpp_stack_is_rc());
443 : while (PL_stack_sp > sp) {
444 : SV *sv = *PL_stack_sp--;
445 : SvREFCNT_dec(sv);
446 : }
447 : #else
448 : PL_stack_sp = sp;
449 : #endif
450 : }
451 :
452 :
453 : /*
454 : =for apidoc rpp_popfree_to_NN
455 :
456 : A variant of rpp_popfree_to() which assumes that all the pointers being
457 : popped off the stack are non-NULL.
458 :
459 : =cut
460 : */
461 :
462 : PERL_STATIC_INLINE void
463 : Perl_rpp_popfree_to_NN(pTHX_ SV **sp)
464 : {
465 : PERL_ARGS_ASSERT_RPP_POPFREE_TO_NN;
466 :
467 : assert(sp <= PL_stack_sp);
468 : #ifdef PERL_RC_STACK
469 : assert(rpp_stack_is_rc());
470 : while (PL_stack_sp > sp) {
471 : SV *sv = *PL_stack_sp--;
472 : assert(sv);
473 : SvREFCNT_dec_NN(sv);
474 : }
475 : #else
476 : PL_stack_sp = sp;
477 : #endif
478 : }
479 :
480 :
481 : /*
482 : =for apidoc rpp_popfree_1
483 :
484 : Pop and free the top item on the argument stack and update C<PL_stack_sp>.
485 :
486 : =cut
487 : */
488 :
489 : PERL_STATIC_INLINE void
490 : Perl_rpp_popfree_1(pTHX)
491 : {
492 : PERL_ARGS_ASSERT_RPP_POPFREE_1;
493 :
494 : #ifdef PERL_RC_STACK
495 : assert(rpp_stack_is_rc());
496 : SV *sv = *PL_stack_sp--;
497 : SvREFCNT_dec(sv);
498 : #else
499 : PL_stack_sp--;
500 : #endif
501 : }
502 :
503 :
504 : /*
505 : =for apidoc rpp_popfree_1_NN
506 :
507 : A variant of rpp_popfree_1() which assumes that the pointer being popped
508 : off the stack is non-NULL.
509 :
510 : =cut
511 : */
512 :
513 : PERL_STATIC_INLINE void
514 : Perl_rpp_popfree_1_NN(pTHX)
515 : {
516 : PERL_ARGS_ASSERT_RPP_POPFREE_1_NN;
517 :
518 : assert(*PL_stack_sp);
519 : #ifdef PERL_RC_STACK
520 : assert(rpp_stack_is_rc());
521 : SV *sv = *PL_stack_sp--;
522 : SvREFCNT_dec_NN(sv);
523 : #else
524 : PL_stack_sp--;
525 : #endif
526 : }
527 :
528 :
529 : /*
530 : =for apidoc rpp_popfree_2
531 :
532 : Pop and free the top two items on the argument stack and update
533 : C<PL_stack_sp>.
534 :
535 : =cut
536 : */
537 :
538 :
539 : PERL_STATIC_INLINE void
540 : Perl_rpp_popfree_2(pTHX)
541 : {
542 : PERL_ARGS_ASSERT_RPP_POPFREE_2;
543 :
544 : #ifdef PERL_RC_STACK
545 : assert(rpp_stack_is_rc());
546 : for (int i = 0; i < 2; i++) {
547 : SV *sv = *PL_stack_sp--;
548 : SvREFCNT_dec(sv);
549 : }
550 : #else
551 : PL_stack_sp -= 2;
552 : #endif
553 : }
554 :
555 :
556 : /*
557 : =for apidoc rpp_popfree_2_NN
558 :
559 : A variant of rpp_popfree_2() which assumes that the two pointers being
560 : popped off the stack are non-NULL.
561 :
562 : =cut
563 : */
564 :
565 :
566 : PERL_STATIC_INLINE void
567 : Perl_rpp_popfree_2_NN(pTHX)
568 : {
569 : PERL_ARGS_ASSERT_RPP_POPFREE_2_NN;
570 : #ifdef PERL_RC_STACK
571 : SV *sv2 = *PL_stack_sp--;
572 : assert(sv2);
573 : SV *sv1 = *PL_stack_sp;
574 : assert(sv1);
575 :
576 : assert(rpp_stack_is_rc());
577 : U32 rc1 = SvREFCNT(sv1);
578 : U32 rc2 = SvREFCNT(sv2);
579 : /* This expression is intended to be true if either of rc1 or rc2 has
580 : * the value 0 or 1, but using only a single branch test, rather
581 : * than the two branches that a compiler would plant for a boolean
582 : * expression. We are working on the assumption that, most of the
583 : * time, neither of the args to a binary function will need to be
584 : * freed - they're likely to lex vars, or PADTMPs or whatever.
585 : * So give the CPU a single branch that is rarely taken. */
586 : if (UNLIKELY( !(rc1>>1) + !(rc2>>1) ))
587 : /* at least one of the old SVs needs freeing. Do it the long way */
588 : Perl_rpp_free_2_(aTHX_ sv1, sv2, rc1, rc2);
589 : else {
590 : SvREFCNT(sv1) = rc1 - 1;
591 : SvREFCNT(sv2) = rc2 - 1;
592 : }
593 : PL_stack_sp--;
594 : #else
595 : PL_stack_sp -= 2;
596 : #endif
597 : }
598 :
599 :
600 : /*
601 : =for apidoc rpp_pop_1_norc
602 :
603 : Pop and return the top item off the argument stack and update
604 : C<PL_stack_sp>. It's similar to rpp_popfree_1(), except that it actually
605 : returns a value, and it I<doesn't> decrement the SV's reference count.
606 : On non-C<PERL_RC_STACK> builds it actually increments the SV's reference
607 : count.
608 :
609 : This is useful in cases where the popped value is immediately embedded
610 : somewhere e.g. via av_store(), allowing you skip decrementing and then
611 : immediately incrementing the reference count again (and risk prematurely
612 : freeing the SV if it had a RC of 1). On non-RC builds, the reference count
613 : bookkeeping still works too, which is why it should be used rather than
614 : a simple C<*PL_stack_sp-->.
615 :
616 : =cut
617 : */
618 :
619 : PERL_STATIC_INLINE SV*
620 : Perl_rpp_pop_1_norc(pTHX)
621 : {
622 : PERL_ARGS_ASSERT_RPP_POP_1_NORC
623 :
624 : SV *sv = *PL_stack_sp--;
625 :
626 : #ifndef PERL_RC_STACK
627 : SvREFCNT_inc(sv);
628 : #else
629 : assert(rpp_stack_is_rc());
630 : #endif
631 : return sv;
632 : }
633 :
634 :
635 :
636 : /*
637 : =for apidoc rpp_push_1
638 : =for apidoc_item rpp_push_IMM
639 : =for apidoc_item rpp_push_2
640 : =for apidoc_item rpp_xpush_1
641 : =for apidoc_item rpp_xpush_IMM
642 : =for apidoc_item rpp_xpush_2
643 :
644 : Push one or two SVs onto the stack, incrementing their reference counts
645 : and updating C<PL_stack_sp>. With the C<x> variants, it extends the stack
646 : first. The C<IMM> variants assume that the single argument is an immortal
647 : such as <&PL_sv_undef> and, for efficiency, will skip incrementing its
648 : reference count.
649 :
650 : =cut
651 : */
652 :
653 : PERL_STATIC_INLINE void
654 : Perl_rpp_push_1(pTHX_ SV *sv)
655 : {
656 : PERL_ARGS_ASSERT_RPP_PUSH_1;
657 :
658 : *++PL_stack_sp = sv;
659 : #ifdef PERL_RC_STACK
660 : assert(rpp_stack_is_rc());
661 : SvREFCNT_inc_simple_void_NN(sv);
662 : #endif
663 : }
664 :
665 : PERL_STATIC_INLINE void
666 : Perl_rpp_push_IMM(pTHX_ SV *sv)
667 : {
668 : PERL_ARGS_ASSERT_RPP_PUSH_IMM;
669 :
670 : assert(SvIMMORTAL(sv));
671 : *++PL_stack_sp = sv;
672 : #ifdef PERL_RC_STACK
673 : assert(rpp_stack_is_rc());
674 : #endif
675 : }
676 :
677 : PERL_STATIC_INLINE void
678 : Perl_rpp_push_2(pTHX_ SV *sv1, SV *sv2)
679 : {
680 : PERL_ARGS_ASSERT_RPP_PUSH_2;
681 :
682 : *++PL_stack_sp = sv1;
683 : *++PL_stack_sp = sv2;
684 : #ifdef PERL_RC_STACK
685 : assert(rpp_stack_is_rc());
686 : SvREFCNT_inc_simple_void_NN(sv1);
687 : SvREFCNT_inc_simple_void_NN(sv2);
688 : #endif
689 : }
690 :
691 : PERL_STATIC_INLINE void
692 : Perl_rpp_xpush_1(pTHX_ SV *sv)
693 : {
694 : PERL_ARGS_ASSERT_RPP_XPUSH_1;
695 :
696 : rpp_extend(1);
697 : rpp_push_1(sv);
698 : }
699 :
700 : PERL_STATIC_INLINE void
701 : Perl_rpp_xpush_IMM(pTHX_ SV *sv)
702 : {
703 : PERL_ARGS_ASSERT_RPP_XPUSH_IMM;
704 :
705 : rpp_extend(1);
706 : rpp_push_IMM(sv);
707 : }
708 :
709 : PERL_STATIC_INLINE void
710 : Perl_rpp_xpush_2(pTHX_ SV *sv1, SV *sv2)
711 : {
712 : PERL_ARGS_ASSERT_RPP_XPUSH_2;
713 :
714 : rpp_extend(2);
715 : rpp_push_2(sv1, sv2);
716 : }
717 :
718 :
719 : /*
720 : =for apidoc rpp_push_1_norc
721 :
722 : Push C<sv> onto the stack without incrementing its reference count, and
723 : update C<PL_stack_sp>. On non-PERL_RC_STACK builds, mortalise too.
724 :
725 : This is most useful where an SV has just been created and already has a
726 : reference count of 1, but has not yet been anchored anywhere.
727 :
728 : =cut
729 : */
730 :
731 : PERL_STATIC_INLINE void
732 : Perl_rpp_push_1_norc(pTHX_ SV *sv)
733 : {
734 : PERL_ARGS_ASSERT_RPP_PUSH_1;
735 :
736 : *++PL_stack_sp = sv;
737 : #ifdef PERL_RC_STACK
738 : assert(rpp_stack_is_rc());
739 : #else
740 : sv_2mortal(sv);
741 : #endif
742 : }
743 :
744 :
745 : /*
746 : =for apidoc rpp_replace_1_1
747 : =for apidoc_item rpp_replace_1_1_NN
748 : =for apidoc_item rpp_replace_1_IMM_NN
749 :
750 : Replace the current top stack item with C<sv>, while suitably adjusting
751 : reference counts. Equivalent to rpp_popfree_1(); rpp_push_1(sv), but
752 : is more efficient and handles both SVs being the same.
753 :
754 : The C<_NN> variant assumes that the pointer on the stack to the SV being
755 : freed is non-NULL.
756 :
757 : The C<IMM_NN> variant is like the C<_NN> variant, but in addition, assumes
758 : that the single argument is an immortal such as <&PL_sv_undef> and, for
759 : efficiency, will skip incrementing its reference count.
760 :
761 : =cut
762 : */
763 :
764 : PERL_STATIC_INLINE void
765 : Perl_rpp_replace_1_1(pTHX_ SV *sv)
766 : {
767 : PERL_ARGS_ASSERT_RPP_REPLACE_1_1;
768 :
769 : assert(sv);
770 : #ifdef PERL_RC_STACK
771 : assert(rpp_stack_is_rc());
772 : SV *oldsv = *PL_stack_sp;
773 : *PL_stack_sp = sv;
774 : SvREFCNT_inc_simple_void_NN(sv);
775 : SvREFCNT_dec(oldsv);
776 : #else
777 : *PL_stack_sp = sv;
778 : #endif
779 : }
780 :
781 :
782 : PERL_STATIC_INLINE void
783 : Perl_rpp_replace_1_1_NN(pTHX_ SV *sv)
784 : {
785 : PERL_ARGS_ASSERT_RPP_REPLACE_1_1_NN;
786 :
787 : assert(sv);
788 : assert(*PL_stack_sp);
789 : #ifdef PERL_RC_STACK
790 : assert(rpp_stack_is_rc());
791 : SV *oldsv = *PL_stack_sp;
792 : *PL_stack_sp = sv;
793 : SvREFCNT_inc_simple_void_NN(sv);
794 : SvREFCNT_dec_NN(oldsv);
795 : #else
796 : *PL_stack_sp = sv;
797 : #endif
798 : }
799 :
800 :
801 : PERL_STATIC_INLINE void
802 : Perl_rpp_replace_1_IMM_NN(pTHX_ SV *sv)
803 : {
804 : PERL_ARGS_ASSERT_RPP_REPLACE_1_IMM_NN;
805 :
806 : assert(sv);
807 : assert(SvIMMORTAL(sv));
808 : assert(*PL_stack_sp);
809 : #ifdef PERL_RC_STACK
810 : assert(rpp_stack_is_rc());
811 : SV *oldsv = *PL_stack_sp;
812 : *PL_stack_sp = sv;
813 : SvREFCNT_dec_NN(oldsv);
814 : #else
815 : *PL_stack_sp = sv;
816 : #endif
817 : }
818 :
819 :
820 : /*
821 : =for apidoc rpp_replace_2_1
822 : =for apidoc_item rpp_replace_2_1_NN
823 : =for apidoc_item rpp_replace_2_IMM_NN
824 :
825 : Replace the current top to stacks item with C<sv>, while suitably
826 : adjusting reference counts. Equivalent to rpp_popfree_2(); rpp_push_1(sv),
827 : but is more efficient and handles SVs being the same.
828 :
829 : The C<_NN> variant assumes that the pointers on the stack to the SVs being
830 : freed are non-NULL.
831 :
832 : The C<IMM_NN> variant is like the C<_NN> variant, but in addition, assumes
833 : that the single argument is an immortal such as <&PL_sv_undef> and, for
834 : efficiency, will skip incrementing its reference count.
835 : =cut
836 : */
837 :
838 : PERL_STATIC_INLINE void
839 : Perl_rpp_replace_2_1(pTHX_ SV *sv)
840 : {
841 : PERL_ARGS_ASSERT_RPP_REPLACE_2_1;
842 :
843 : #ifdef PERL_RC_STACK
844 : assert(rpp_stack_is_rc());
845 : /* replace PL_stack_sp[-1] first; leave PL_stack_sp[0] in place while
846 : * we free [-1], so if an exception occurs, [0] will still be freed.
847 : */
848 : SV *oldsv = PL_stack_sp[-1];
849 : PL_stack_sp[-1] = sv;
850 : SvREFCNT_inc_simple_void_NN(sv);
851 : SvREFCNT_dec(oldsv);
852 : oldsv = *PL_stack_sp--;
853 : SvREFCNT_dec(oldsv);
854 : #else
855 : *--PL_stack_sp = sv;
856 : #endif
857 : }
858 :
859 :
860 : /* Private helper function for _NN and _IMM_NN variants.
861 : * Assumes sv has already had its ref count incremented,
862 : * ready for being put on the stack.
863 : * Intended to be small and fast, since it's inlined into many hot parts of
864 : * code.
865 : */
866 :
867 : PERL_STATIC_INLINE void
868 : Perl_rpp_replace_2_1_COMMON(pTHX_ SV *sv)
869 : {
870 :
871 : assert(sv);
872 : #ifdef PERL_RC_STACK
873 : SV *sv2 = *PL_stack_sp--;
874 : assert(sv2);
875 : SV *sv1 = *PL_stack_sp;
876 : assert(sv1);
877 :
878 : *PL_stack_sp = sv;
879 : assert(rpp_stack_is_rc());
880 : U32 rc1 = SvREFCNT(sv1);
881 : U32 rc2 = SvREFCNT(sv2);
882 : /* This expression is intended to be true if either of rc1 or rc2 has
883 : * the value 0 or 1, but using only a single branch test, rather
884 : * than the two branches that a compiler would plant for a boolean
885 : * expression. We are working on the assumption that, most of the
886 : * time, neither of the args to a binary function will need to be
887 : * freed - they're likely to lex vars, or PADTMPs or whatever.
888 : * So give the CPU a single branch that is rarely taken. */
889 : if (UNLIKELY( !(rc1>>1) + !(rc2>>1) ))
890 : /* at least one of the old SVs needs freeing. Do it the long way */
891 : Perl_rpp_free_2_(aTHX_ sv1, sv2, rc1, rc2);
892 : else {
893 : SvREFCNT(sv1) = rc1 - 1;
894 : SvREFCNT(sv2) = rc2 - 1;
895 : }
896 : #else
897 : *--PL_stack_sp = sv;
898 : #endif
899 : }
900 :
901 :
902 : PERL_STATIC_INLINE void
903 : Perl_rpp_replace_2_1_NN(pTHX_ SV *sv)
904 : {
905 : PERL_ARGS_ASSERT_RPP_REPLACE_2_1_NN;
906 :
907 : assert(sv);
908 : #ifdef PERL_RC_STACK
909 : SvREFCNT_inc_simple_void_NN(sv);
910 : #endif
911 : rpp_replace_2_1_COMMON(sv);
912 : }
913 :
914 :
915 : PERL_STATIC_INLINE void
916 : Perl_rpp_replace_2_IMM_NN(pTHX_ SV *sv)
917 : {
918 : PERL_ARGS_ASSERT_RPP_REPLACE_2_IMM_NN;
919 :
920 : assert(sv);
921 : assert(SvIMMORTAL(sv));
922 : rpp_replace_2_1_COMMON(sv);
923 : }
924 :
925 :
926 : /*
927 : =for apidoc rpp_replace_at
928 :
929 : Replace the SV at address sp within the stack with C<sv>, while suitably
930 : adjusting reference counts. Equivalent to C<*sp = sv>, except with proper
931 : reference count handling.
932 :
933 : =cut
934 : */
935 :
936 : PERL_STATIC_INLINE void
937 : Perl_rpp_replace_at(pTHX_ SV **sp, SV *sv)
938 : {
939 : PERL_ARGS_ASSERT_RPP_REPLACE_AT;
940 :
941 : #ifdef PERL_RC_STACK
942 : assert(rpp_stack_is_rc());
943 : SV *oldsv = *sp;
944 : *sp = sv;
945 : SvREFCNT_inc_simple_void_NN(sv);
946 : SvREFCNT_dec(oldsv);
947 : #else
948 : *sp = sv;
949 : #endif
950 : }
951 :
952 :
953 : /*
954 : =for apidoc rpp_replace_at_NN
955 :
956 : A variant of rpp_replace_at() which assumes that the SV pointer on the
957 : stack is non-NULL.
958 :
959 : =cut
960 : */
961 :
962 : PERL_STATIC_INLINE void
963 : Perl_rpp_replace_at_NN(pTHX_ SV **sp, SV *sv)
964 : {
965 : PERL_ARGS_ASSERT_RPP_REPLACE_AT_NN;
966 :
967 : assert(sv);
968 : assert(*sp);
969 : #ifdef PERL_RC_STACK
970 : assert(rpp_stack_is_rc());
971 : SV *oldsv = *sp;
972 : *sp = sv;
973 : SvREFCNT_inc_simple_void_NN(sv);
974 : SvREFCNT_dec_NN(oldsv);
975 : #else
976 : *sp = sv;
977 : #endif
978 : }
979 :
980 :
981 : /*
982 : =for apidoc rpp_replace_at_norc
983 :
984 : Replace the SV at address sp within the stack with C<sv>, while suitably
985 : adjusting the reference count of the old SV. Equivalent to C<*sp = sv>,
986 : except with proper reference count handling.
987 :
988 : C<sv>'s reference count doesn't get incremented. On non-C<PERL_RC_STACK>
989 : builds, it gets mortalised too.
990 :
991 : This is most useful where an SV has just been created and already has a
992 : reference count of 1, but has not yet been anchored anywhere.
993 :
994 : =cut
995 : */
996 :
997 : PERL_STATIC_INLINE void
998 : Perl_rpp_replace_at_norc(pTHX_ SV **sp, SV *sv)
999 : {
1000 : PERL_ARGS_ASSERT_RPP_REPLACE_AT_NORC;
1001 :
1002 : #ifdef PERL_RC_STACK
1003 : assert(rpp_stack_is_rc());
1004 : SV *oldsv = *sp;
1005 : *sp = sv;
1006 : SvREFCNT_dec(oldsv);
1007 : #else
1008 : *sp = sv;
1009 : sv_2mortal(sv);
1010 : #endif
1011 : }
1012 :
1013 :
1014 : /*
1015 : =for apidoc rpp_replace_at_norc_NN
1016 :
1017 : A variant of rpp_replace_at_norc() which assumes that the SV pointer on the
1018 : stack is non-NULL.
1019 :
1020 : =cut
1021 : */
1022 :
1023 : PERL_STATIC_INLINE void
1024 : Perl_rpp_replace_at_norc_NN(pTHX_ SV **sp, SV *sv)
1025 : {
1026 : PERL_ARGS_ASSERT_RPP_REPLACE_AT_NORC_NN;
1027 :
1028 : assert(*sp);
1029 : #ifdef PERL_RC_STACK
1030 : assert(rpp_stack_is_rc());
1031 : SV *oldsv = *sp;
1032 : *sp = sv;
1033 : SvREFCNT_dec_NN(oldsv);
1034 : #else
1035 : *sp = sv;
1036 : sv_2mortal(sv);
1037 : #endif
1038 : }
1039 :
1040 :
1041 : /*
1042 : =for apidoc rpp_context
1043 :
1044 : Impose void, scalar or list context on the stack.
1045 : First, pop C<extra> items off the stack, then when C<gimme> is:
1046 : C<G_LIST>: return as-is.
1047 : C<G_VOID>: pop everything back to C<mark>
1048 : C<G_SCALAR>: move the top stack item (or C<&PL_sv_undef> if none) to
1049 : C<mark+1> and free everything above it.
1050 :
1051 : =cut
1052 : */
1053 :
1054 : PERL_STATIC_INLINE void
1055 : Perl_rpp_context(pTHX_ SV **mark, U8 gimme, SSize_t extra)
1056 : {
1057 : PERL_ARGS_ASSERT_RPP_CONTEXT;
1058 : assert(extra >= 0);
1059 : assert(mark <= PL_stack_sp - extra);
1060 :
1061 : if (gimme == G_LIST)
1062 : mark = PL_stack_sp - extra;
1063 : else if (gimme == G_SCALAR) {
1064 : SV **svp = PL_stack_sp - extra;
1065 : mark++;
1066 : if (mark > svp) {
1067 : /* empty list (plus extra) */
1068 : rpp_popfree_to(svp);
1069 : rpp_extend(1);
1070 : *++PL_stack_sp = &PL_sv_undef;
1071 : return;
1072 : }
1073 : /* swap top and bottom list items */
1074 : SV *top = *svp;
1075 : *svp = *mark;
1076 : *mark = top;
1077 : }
1078 : rpp_popfree_to(mark);
1079 : }
1080 :
1081 :
1082 :
1083 :
1084 : /*
1085 : =for apidoc rpp_try_AMAGIC_1
1086 : =for apidoc_item rpp_try_AMAGIC_2
1087 :
1088 : Check whether either of the one or two SVs at the top of the stack is
1089 : magical or a ref, and in either case handle it specially: invoke get
1090 : magic, call an overload method, or replace a ref with a temporary numeric
1091 : value, as appropriate. If this function returns true, it indicates that
1092 : the correct return value is already on the stack. Intended to be used at
1093 : the beginning of the PP function for unary or binary ops.
1094 :
1095 : =cut
1096 : */
1097 :
1098 : PERL_STATIC_INLINE bool
1099 : Perl_rpp_try_AMAGIC_1(pTHX_ int method, int flags)
1100 : {
1101 : return UNLIKELY((SvFLAGS(*PL_stack_sp) & (SVf_ROK|SVs_GMG)))
1102 : && Perl_try_amagic_un(aTHX_ method, flags);
1103 : }
1104 :
1105 : PERL_STATIC_INLINE bool
1106 : Perl_rpp_try_AMAGIC_2(pTHX_ int method, int flags)
1107 : {
1108 : return UNLIKELY(((SvFLAGS(PL_stack_sp[-1])|SvFLAGS(PL_stack_sp[0]))
1109 : & (SVf_ROK|SVs_GMG)))
1110 : && Perl_try_amagic_bin(aTHX_ method, flags);
1111 : }
1112 :
1113 :
1114 : /*
1115 : =for apidoc rpp_stack_is_rc
1116 :
1117 : Returns a boolean value indicating whether the stack is currently
1118 : reference-counted. Note that if the stack is split (bottom half RC, top
1119 : half non-RC), this function returns false, even if the top half currently
1120 : contains zero items.
1121 :
1122 : =cut
1123 : */
1124 :
1125 : PERL_STATIC_INLINE bool
1126 : Perl_rpp_stack_is_rc(pTHX)
1127 : {
1128 : #ifdef PERL_RC_STACK
1129 : return AvREAL(PL_curstack) && !PL_curstackinfo->si_stack_nonrc_base;
1130 : #else
1131 : return 0;
1132 : #endif
1133 :
1134 : }
1135 :
1136 :
1137 : /*
1138 : =for apidoc rpp_is_lone
1139 :
1140 : Indicates whether the stacked SV C<sv> (assumed to be not yet popped off
1141 : the stack) is only kept alive due to a single reference from the argument
1142 : stack and/or and the temps stack.
1143 :
1144 : This can used for example to decide whether the copying of return values
1145 : in rvalue context can be skipped, or whether it shouldn't be assigned to
1146 : in lvalue context.
1147 :
1148 : =cut
1149 : */
1150 :
1151 :
1152 : PERL_STATIC_INLINE bool
1153 : Perl_rpp_is_lone(pTHX_ SV *sv)
1154 : {
1155 : #ifdef PERL_RC_STACK
1156 : /* note that rpp_is_lone() can be used in wrapped pp functions,
1157 : * where technically the stack is no longer ref-counted; but because
1158 : * the args are non-RC copies of RC args further down the stack, we
1159 : * can't be in a *completely* non-ref stack.
1160 : */
1161 : assert(AvREAL(PL_curstack));
1162 : #endif
1163 :
1164 : return SvREFCNT(sv) <= cBOOL(SvTEMP(sv))
1165 : #ifdef PERL_RC_STACK
1166 : + 1
1167 : && !SvIMMORTAL(sv) /* PL_sv_undef etc are never stealable */
1168 : #endif
1169 : ;
1170 : }
1171 :
1172 :
1173 : /*
1174 : =for apidoc rpp_invoke_xs
1175 :
1176 : Call the XS function associated with C<cv>. Wraps the call if necessary to
1177 : handle XS functions which are not aware of reference-counted stacks.
1178 :
1179 : =cut
1180 : */
1181 :
1182 :
1183 : PERL_STATIC_INLINE void
1184 : Perl_rpp_invoke_xs(pTHX_ CV *cv)
1185 : {
1186 : PERL_ARGS_ASSERT_RPP_INVOKE_XS;
1187 :
1188 : #ifdef PERL_RC_STACK
1189 : if (!CvXS_RCSTACK(cv))
1190 : Perl_xs_wrap(aTHX_ CvXSUB(cv), cv);
1191 : else
1192 : #endif
1193 : CvXSUB(cv)(aTHX_ cv);
1194 : }
1195 :
1196 :
1197 :
1198 :
1199 : /* ----------------------------- regexp.h ----------------------------- */
1200 :
1201 : /* PVLVs need to act as a superset of all scalar types - they are basically
1202 : * PVMGs with a few extra fields.
1203 : * REGEXPs are first class scalars, but have many fields that can't be copied
1204 : * into a PVLV body.
1205 : *
1206 : * Hence we take a different approach - instead of a copy, PVLVs store a pointer
1207 : * back to the original body. To avoid increasing the size of PVLVs just for the
1208 : * rare case of REGEXP assignment, this pointer is stored in the memory usually
1209 : * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
1210 : * read the pointer from the two possible locations. The macro SvLEN() wraps the
1211 : * access to the union's member xpvlenu_len, but there is no equivalent macro
1212 : * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
1213 : *
1214 : * See commit df6b4bd56551f2d3 for more details. */
1215 :
1216 : PERL_STATIC_INLINE struct regexp *
1217 : Perl_ReANY(const REGEXP * const re)
1218 : {
1219 : XPV* const p = (XPV*)SvANY(re);
1220 :
1221 : PERL_ARGS_ASSERT_REANY;
1222 : assert(isREGEXP(re));
1223 :
1224 : return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
1225 : : (struct regexp *)p;
1226 : }
1227 :
1228 : /* ------------------------------- utf8.h ------------------------------- */
1229 :
1230 : /*
1231 : =for apidoc_section $unicode
1232 : */
1233 :
1234 : PERL_STATIC_INLINE void
1235 : Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
1236 : {
1237 : /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
1238 : * encoded string at '*dest', updating '*dest' to include it */
1239 :
1240 : PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
1241 :
1242 : if (NATIVE_BYTE_IS_INVARIANT(byte))
1243 : *((*dest)++) = byte;
1244 : else {
1245 : *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
1246 : *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
1247 : }
1248 : }
1249 :
1250 : /*
1251 : =for apidoc valid_utf8_to_uvchr
1252 : Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
1253 : known that the next character in the input UTF-8 string C<s> is well-formed
1254 : (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
1255 : points, and non-Unicode code points are allowed.
1256 :
1257 : =cut
1258 :
1259 : */
1260 :
1261 : PERL_STATIC_INLINE UV
1262 : Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
1263 : {
1264 : const UV expectlen = UTF8SKIP(s);
1265 : const U8* send = s + expectlen;
1266 : UV uv = *s;
1267 :
1268 : PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
1269 :
1270 : if (retlen) {
1271 : *retlen = expectlen;
1272 : }
1273 :
1274 : /* An invariant is trivially returned */
1275 : if (expectlen == 1) {
1276 : return uv;
1277 : }
1278 :
1279 : /* Remove the leading bits that indicate the number of bytes, leaving just
1280 : * the bits that are part of the value */
1281 : uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1282 :
1283 : /* Now, loop through the remaining bytes, accumulating each into the
1284 : * working total as we go. (I khw tried unrolling the loop for up to 4
1285 : * bytes, but there was no performance improvement) */
1286 : for (++s; s < send; s++) {
1287 : uv = UTF8_ACCUMULATE(uv, *s);
1288 : }
1289 :
1290 : return UNI_TO_NATIVE(uv);
1291 :
1292 : }
1293 :
1294 : /*
1295 : =for apidoc is_utf8_invariant_string
1296 :
1297 : Returns TRUE if the first C<len> bytes of the string C<s> are the same
1298 : regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
1299 : EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
1300 : are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
1301 : the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
1302 : characters are invariant, but so also are the C1 controls.
1303 :
1304 : If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
1305 : use this option, that C<s> can't have embedded C<NUL> characters and has to
1306 : have a terminating C<NUL> byte).
1307 :
1308 : See also
1309 : C<L</is_utf8_string>>,
1310 : C<L</is_utf8_string_flags>>,
1311 : C<L</is_utf8_string_loc>>,
1312 : C<L</is_utf8_string_loc_flags>>,
1313 : C<L</is_utf8_string_loclen>>,
1314 : C<L</is_utf8_string_loclen_flags>>,
1315 : C<L</is_utf8_fixed_width_buf_flags>>,
1316 : C<L</is_utf8_fixed_width_buf_loc_flags>>,
1317 : C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1318 : C<L</is_strict_utf8_string>>,
1319 : C<L</is_strict_utf8_string_loc>>,
1320 : C<L</is_strict_utf8_string_loclen>>,
1321 : C<L</is_c9strict_utf8_string>>,
1322 : C<L</is_c9strict_utf8_string_loc>>,
1323 : and
1324 : C<L</is_c9strict_utf8_string_loclen>>.
1325 :
1326 : =cut
1327 :
1328 : */
1329 :
1330 : #define is_utf8_invariant_string(s, len) \
1331 : is_utf8_invariant_string_loc(s, len, NULL)
1332 :
1333 : /*
1334 : =for apidoc is_utf8_invariant_string_loc
1335 :
1336 : Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
1337 : the first UTF-8 variant character in the C<ep> pointer; if all characters are
1338 : UTF-8 invariant, this function does not change the contents of C<*ep>.
1339 :
1340 : =cut
1341 :
1342 : */
1343 :
1344 : PERL_STATIC_INLINE bool
1345 : Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1346 : {
1347 : const U8* send;
1348 : const U8* x = s;
1349 :
1350 : PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
1351 :
1352 : if (len == 0) {
1353 : len = strlen((const char *)s);
1354 : }
1355 :
1356 : send = s + len;
1357 :
1358 : /* This looks like 0x010101... */
1359 : # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
1360 :
1361 : /* This looks like 0x808080... */
1362 : # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
1363 : # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
1364 : # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
1365 :
1366 : /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
1367 : * or'ing together the lowest bits of 'x'. Hopefully the final term gets
1368 : * optimized out completely on a 32-bit system, and its mask gets optimized out
1369 : * on a 64-bit system */
1370 : # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
1371 : | ( PTR2nat(x) >> 1) \
1372 : | ( ( (PTR2nat(x) \
1373 : & PERL_WORD_BOUNDARY_MASK) >> 2))))
1374 :
1375 : #ifndef EBCDIC
1376 :
1377 : /* Do the word-at-a-time iff there is at least one usable full word. That
1378 : * means that after advancing to a word boundary, there still is at least a
1379 : * full word left. The number of bytes needed to advance is 'wordsize -
1380 : * offset' unless offset is 0. */
1381 : if ((STRLEN) (send - x) >= PERL_WORDSIZE
1382 :
1383 : /* This term is wordsize if subword; 0 if not */
1384 : + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1385 :
1386 : /* 'offset' */
1387 : - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1388 : {
1389 :
1390 : /* Process per-byte until reach word boundary. XXX This loop could be
1391 : * eliminated if we knew that this platform had fast unaligned reads */
1392 : while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1393 : if (! UTF8_IS_INVARIANT(*x)) {
1394 : if (ep) {
1395 : *ep = x;
1396 : }
1397 :
1398 : return FALSE;
1399 : }
1400 : x++;
1401 : }
1402 :
1403 : /* Here, we know we have at least one full word to process. Process
1404 : * per-word as long as we have at least a full word left */
1405 : do {
1406 : if ((* (const PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
1407 :
1408 : /* Found a variant. Just return if caller doesn't want its
1409 : * exact position */
1410 : if (! ep) {
1411 : return FALSE;
1412 : }
1413 :
1414 : # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
1415 : || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1416 :
1417 : *ep = x + variant_byte_number(* (const PERL_UINTMAX_T *) x);
1418 : assert(*ep >= s && *ep < send);
1419 :
1420 : return FALSE;
1421 :
1422 : # else /* If weird byte order, drop into next loop to do byte-at-a-time
1423 : checks. */
1424 :
1425 : break;
1426 : # endif
1427 : }
1428 :
1429 : x += PERL_WORDSIZE;
1430 :
1431 : } while (x + PERL_WORDSIZE <= send);
1432 : }
1433 :
1434 : #endif /* End of ! EBCDIC */
1435 :
1436 : /* Process per-byte */
1437 : while (x < send) {
1438 : if (! UTF8_IS_INVARIANT(*x)) {
1439 : if (ep) {
1440 : *ep = x;
1441 : }
1442 :
1443 : return FALSE;
1444 : }
1445 :
1446 : x++;
1447 : }
1448 :
1449 : return TRUE;
1450 : }
1451 :
1452 : /* See if the platform has builtins for finding the most/least significant bit,
1453 : * and which one is right for using on 32 and 64 bit operands */
1454 : #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
1455 : # if U32SIZE == INTSIZE
1456 : # define PERL_CLZ_32 __builtin_clz
1457 : # endif
1458 : # if defined(U64TYPE) && U64SIZE == INTSIZE
1459 : # define PERL_CLZ_64 __builtin_clz
1460 : # endif
1461 : #endif
1462 : #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
1463 : # if U32SIZE == INTSIZE
1464 : # define PERL_CTZ_32 __builtin_ctz
1465 : # endif
1466 : # if defined(U64TYPE) && U64SIZE == INTSIZE
1467 : # define PERL_CTZ_64 __builtin_ctz
1468 : # endif
1469 : #endif
1470 :
1471 : #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
1472 : # if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
1473 : # define PERL_CLZ_32 __builtin_clzl
1474 : # endif
1475 : # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
1476 : # define PERL_CLZ_64 __builtin_clzl
1477 : # endif
1478 : #endif
1479 : #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
1480 : # if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
1481 : # define PERL_CTZ_32 __builtin_ctzl
1482 : # endif
1483 : # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
1484 : # define PERL_CTZ_64 __builtin_ctzl
1485 : # endif
1486 : #endif
1487 :
1488 : #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
1489 : # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
1490 : # define PERL_CLZ_32 __builtin_clzll
1491 : # endif
1492 : # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
1493 : # define PERL_CLZ_64 __builtin_clzll
1494 : # endif
1495 : #endif
1496 : #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
1497 : # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
1498 : # define PERL_CTZ_32 __builtin_ctzll
1499 : # endif
1500 : # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
1501 : # define PERL_CTZ_64 __builtin_ctzll
1502 : # endif
1503 : #endif
1504 :
1505 : #if defined(WIN32)
1506 : # include <intrin.h>
1507 : /* MinGW warns that it ignores "pragma intrinsic". */
1508 : # if defined(_MSC_VER)
1509 : # pragma intrinsic(_BitScanForward)
1510 : # pragma intrinsic(_BitScanReverse)
1511 : # if defined(_WIN64)
1512 : # pragma intrinsic(_BitScanForward64)
1513 : # pragma intrinsic(_BitScanReverse64)
1514 : # endif
1515 : # endif
1516 : #endif
1517 :
1518 : /* The reason there are not checks to see if ffs() and ffsl() are available for
1519 : * determining the lsb, is because these don't improve on the deBruijn method
1520 : * fallback, which is just a branchless integer multiply, array element
1521 : * retrieval, and shift. The others, even if the function call overhead is
1522 : * optimized out, have to cope with the possibility of the input being all
1523 : * zeroes, and almost certainly will have conditionals for this eventuality.
1524 : * khw, at the time of this commit, looked at the source for both gcc and clang
1525 : * to verify this. (gcc used a method inferior to deBruijn.) */
1526 :
1527 : /* Below are functions to find the first, last, or only set bit in a word. On
1528 : * platforms with 64-bit capability, there is a pair for each operation; the
1529 : * first taking a 64 bit operand, and the second a 32 bit one. The logic is
1530 : * the same in each pair, so the second is stripped of most comments. */
1531 :
1532 : #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
1533 :
1534 : PERL_STATIC_INLINE unsigned
1535 : Perl_lsbit_pos64(U64 word)
1536 : {
1537 : /* Find the position (0..63) of the least significant set bit in the input
1538 : * word */
1539 :
1540 : ASSUME(word != 0);
1541 :
1542 : /* If we can determine that the platform has a usable fast method to get
1543 : * this info, use that */
1544 :
1545 : # if defined(PERL_CTZ_64)
1546 : # define PERL_HAS_FAST_GET_LSB_POS64
1547 :
1548 : return (unsigned) PERL_CTZ_64(word);
1549 :
1550 : # elif U64SIZE == 8 && defined(_WIN64)
1551 : # define PERL_HAS_FAST_GET_LSB_POS64
1552 :
1553 : {
1554 : unsigned long index;
1555 : _BitScanForward64(&index, word);
1556 : return (unsigned)index;
1557 : }
1558 :
1559 : # else
1560 :
1561 : /* Here, we didn't find a fast method for finding the lsb. Fall back to
1562 : * making the lsb the only set bit in the word, and use our function that
1563 : * works on words with a single bit set.
1564 : *
1565 : * Isolate the lsb;
1566 : * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
1567 : *
1568 : * The word will look like this, with a rightmost set bit in position 's':
1569 : * ('x's are don't cares, and 'y's are their complements)
1570 : * s
1571 : * x..x100..00
1572 : * y..y011..11 Complement
1573 : * y..y100..00 Add 1
1574 : * 0..0100..00 And with the original
1575 : *
1576 : * (Yes, complementing and adding 1 is just taking the negative on 2's
1577 : * complement machines, but not on 1's complement ones, and some compilers
1578 : * complain about negating an unsigned.)
1579 : */
1580 : return single_1bit_pos64(word & (~word + 1));
1581 :
1582 : # endif
1583 :
1584 : }
1585 :
1586 : # define lsbit_pos_uintmax_(word) lsbit_pos64(word)
1587 : #else /* ! QUAD */
1588 : # define lsbit_pos_uintmax_(word) lsbit_pos32(word)
1589 : #endif
1590 :
1591 : PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */
1592 : Perl_lsbit_pos32(U32 word)
1593 : {
1594 : /* Find the position (0..31) of the least significant set bit in the input
1595 : * word */
1596 :
1597 : ASSUME(word != 0);
1598 :
1599 : #if defined(PERL_CTZ_32)
1600 : # define PERL_HAS_FAST_GET_LSB_POS32
1601 :
1602 : return (unsigned) PERL_CTZ_32(word);
1603 :
1604 : #elif U32SIZE == 4 && defined(WIN32)
1605 : # define PERL_HAS_FAST_GET_LSB_POS32
1606 :
1607 : {
1608 : unsigned long index;
1609 : _BitScanForward(&index, word);
1610 : return (unsigned)index;
1611 : }
1612 :
1613 : #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1614 : # define PERL_HAS_FAST_GET_LSB_POS32
1615 :
1616 : /* Unlikely, but possible for the platform to have a wider fast operation
1617 : * but not a narrower one. But easy enough to handle the case by widening
1618 : * the parameter size. */
1619 : return lsbit_pos64(word);
1620 :
1621 : #else
1622 :
1623 : return single_1bit_pos32(word & (~word + 1));
1624 :
1625 : #endif
1626 :
1627 : }
1628 :
1629 :
1630 : /* Convert the leading zeros count to the bit position of the first set bit.
1631 : * This just subtracts from the highest position, 31 or 63. But some compilers
1632 : * don't optimize this optimally, and so a bit of bit twiddling encourages them
1633 : * to do the right thing. It turns out that subtracting a smaller non-negative
1634 : * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
1635 : * the two numbers. To see why, first note that the sum of any number, x, and
1636 : * its complement, x', is all ones. So all ones minus x is x'. Then note that
1637 : * the xor of x and all ones is x'. */
1638 : #define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc))
1639 :
1640 : #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
1641 :
1642 : PERL_STATIC_INLINE unsigned
1643 : Perl_msbit_pos64(U64 word)
1644 : {
1645 : /* Find the position (0..63) of the most significant set bit in the input
1646 : * word */
1647 :
1648 : ASSUME(word != 0);
1649 :
1650 : /* If we can determine that the platform has a usable fast method to get
1651 : * this, use that */
1652 :
1653 : # if defined(PERL_CLZ_64)
1654 : # define PERL_HAS_FAST_GET_MSB_POS64
1655 :
1656 : return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
1657 :
1658 : # elif U64SIZE == 8 && defined(_WIN64)
1659 : # define PERL_HAS_FAST_GET_MSB_POS64
1660 :
1661 : {
1662 : unsigned long index;
1663 : _BitScanReverse64(&index, word);
1664 : return (unsigned)index;
1665 : }
1666 :
1667 : # else
1668 :
1669 : /* Here, we didn't find a fast method for finding the msb. Fall back to
1670 : * making the msb the only set bit in the word, and use our function that
1671 : * works on words with a single bit set.
1672 : *
1673 : * Isolate the msb; http://codeforces.com/blog/entry/10330
1674 : *
1675 : * Only the most significant set bit matters. Or'ing word with its right
1676 : * shift of 1 makes that bit and the next one to its right both 1.
1677 : * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
1678 : * ... We end with the msb and all to the right being 1. */
1679 : word |= (word >> 1);
1680 : word |= (word >> 2);
1681 : word |= (word >> 4);
1682 : word |= (word >> 8);
1683 : word |= (word >> 16);
1684 : word |= (word >> 32);
1685 :
1686 : /* Then subtracting the right shift by 1 clears all but the left-most of
1687 : * the 1 bits, which is our desired result */
1688 : word -= (word >> 1);
1689 :
1690 : /* Now we have a single bit set */
1691 : return single_1bit_pos64(word);
1692 :
1693 : # endif
1694 :
1695 : }
1696 :
1697 : # define msbit_pos_uintmax_(word) msbit_pos64(word)
1698 : #else /* ! QUAD */
1699 : # define msbit_pos_uintmax_(word) msbit_pos32(word)
1700 : #endif
1701 :
1702 : PERL_STATIC_INLINE unsigned
1703 : Perl_msbit_pos32(U32 word)
1704 : {
1705 : /* Find the position (0..31) of the most significant set bit in the input
1706 : * word */
1707 :
1708 : ASSUME(word != 0);
1709 :
1710 : #if defined(PERL_CLZ_32)
1711 : # define PERL_HAS_FAST_GET_MSB_POS32
1712 :
1713 : return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
1714 : #elif U32SIZE == 4 && defined(WIN32)
1715 : # define PERL_HAS_FAST_GET_MSB_POS32
1716 :
1717 : {
1718 : unsigned long index;
1719 : _BitScanReverse(&index, word);
1720 : return (unsigned)index;
1721 : }
1722 :
1723 : #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
1724 : # define PERL_HAS_FAST_GET_MSB_POS32
1725 :
1726 : return msbit_pos64(word); /* Let compiler widen parameter */
1727 :
1728 : #else
1729 :
1730 : word |= (word >> 1);
1731 : word |= (word >> 2);
1732 : word |= (word >> 4);
1733 : word |= (word >> 8);
1734 : word |= (word >> 16);
1735 : word -= (word >> 1);
1736 : return single_1bit_pos32(word);
1737 :
1738 : #endif
1739 :
1740 : }
1741 :
1742 : /* Note that if you are working through all the 1 bits in a word, and don't
1743 : * care which order you process them in, it is better to use lsbit_pos. This
1744 : * is because some platforms have a fast way to find the msb but not the lsb,
1745 : * and others vice versa. The code above falls back to use the single
1746 : * available fast method when the desired one is missing, and it is cheaper to
1747 : * fall back from lsb to msb than the other way around */
1748 :
1749 : #if UVSIZE == U64SIZE
1750 : # define msbit_pos(word) msbit_pos64(word)
1751 : # define lsbit_pos(word) lsbit_pos64(word)
1752 : #elif UVSIZE == U32SIZE
1753 : # define msbit_pos(word) msbit_pos32(word)
1754 : # define lsbit_pos(word) lsbit_pos32(word)
1755 : #endif
1756 :
1757 : #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
1758 :
1759 : PERL_STATIC_INLINE unsigned
1760 : Perl_single_1bit_pos64(U64 word)
1761 : {
1762 : /* Given a 64-bit word known to contain all zero bits except one 1 bit,
1763 : * find and return the 1's position: 0..63 */
1764 :
1765 : # ifdef PERL_CORE /* macro not exported */
1766 : ASSUME(isPOWER_OF_2(word));
1767 : # else
1768 : ASSUME(word && (word & (word-1)) == 0);
1769 : # endif
1770 :
1771 : /* The only set bit is both the most and least significant bit. If we have
1772 : * a fast way of finding either one, use that.
1773 : *
1774 : * It may appear at first glance that those functions call this one, but
1775 : * they don't if the corresponding #define is set */
1776 :
1777 : # ifdef PERL_HAS_FAST_GET_MSB_POS64
1778 :
1779 : return msbit_pos64(word);
1780 :
1781 : # elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1782 :
1783 : return lsbit_pos64(word);
1784 :
1785 : # else
1786 :
1787 : /* The position of the only set bit in a word can be quickly calculated
1788 : * using deBruijn sequences. See for example
1789 : * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
1790 : return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
1791 : >> PERL_deBruijnShift64_];
1792 : # endif
1793 :
1794 : }
1795 :
1796 : #endif
1797 :
1798 : PERL_STATIC_INLINE unsigned
1799 : Perl_single_1bit_pos32(U32 word)
1800 : {
1801 : /* Given a 32-bit word known to contain all zero bits except one 1 bit,
1802 : * find and return the 1's position: 0..31 */
1803 :
1804 : #ifdef PERL_CORE /* macro not exported */
1805 : ASSUME(isPOWER_OF_2(word));
1806 : #else
1807 : ASSUME(word && (word & (word-1)) == 0);
1808 : #endif
1809 : #ifdef PERL_HAS_FAST_GET_MSB_POS32
1810 :
1811 : return msbit_pos32(word);
1812 :
1813 : #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
1814 :
1815 : return lsbit_pos32(word);
1816 :
1817 : #else
1818 :
1819 : return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
1820 : >> PERL_deBruijnShift32_];
1821 : #endif
1822 :
1823 : }
1824 :
1825 : #ifndef EBCDIC
1826 :
1827 : PERL_STATIC_INLINE unsigned int
1828 : Perl_variant_byte_number(PERL_UINTMAX_T word)
1829 : {
1830 : /* This returns the position in a word (0..7) of the first variant byte in
1831 : * it. This is a helper function. Note that there are no branches */
1832 :
1833 : /* Get just the msb bits of each byte */
1834 : word &= PERL_VARIANTS_WORD_MASK;
1835 :
1836 : /* This should only be called if we know there is a variant byte in the
1837 : * word */
1838 : assert(word);
1839 :
1840 : # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1841 :
1842 : /* Bytes are stored like
1843 : * Byte8 ... Byte2 Byte1
1844 : * 63..56...15...8 7...0
1845 : * so getting the lsb of the whole modified word is getting the msb of the
1846 : * first byte that has its msb set */
1847 : word = lsbit_pos_uintmax_(word);
1848 :
1849 : /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert
1850 : * to 0..7 */
1851 : return (unsigned int) ((word + 1) >> 3) - 1;
1852 :
1853 : # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1854 :
1855 : /* Bytes are stored like
1856 : * Byte1 Byte2 ... Byte8
1857 : * 63..56 55..47 ... 7...0
1858 : * so getting the msb of the whole modified word is getting the msb of the
1859 : * first byte that has its msb set */
1860 : word = msbit_pos_uintmax_(word);
1861 :
1862 : /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert
1863 : * to 0..7 */
1864 : word = ((word + 1) >> 3) - 1;
1865 :
1866 : /* And invert the result because of the reversed byte order on this
1867 : * platform */
1868 : word = CHARBITS - word - 1;
1869 :
1870 : return (unsigned int) word;
1871 :
1872 : # else
1873 : # error Unexpected byte order
1874 : # endif
1875 :
1876 : }
1877 :
1878 : #endif
1879 : #if defined(PERL_CORE) || defined(PERL_EXT)
1880 :
1881 : /*
1882 : =for apidoc variant_under_utf8_count
1883 :
1884 : This function looks at the sequence of bytes between C<s> and C<e>, which are
1885 : assumed to be encoded in ASCII/Latin1, and returns how many of them would
1886 : change should the string be translated into UTF-8. Due to the nature of UTF-8,
1887 : each of these would occupy two bytes instead of the single one in the input
1888 : string. Thus, this function returns the precise number of bytes the string
1889 : would expand by when translated to UTF-8.
1890 :
1891 : Unlike most of the other functions that have C<utf8> in their name, the input
1892 : to this function is NOT a UTF-8-encoded string. The function name is slightly
1893 : I<odd> to emphasize this.
1894 :
1895 : This function is internal to Perl because khw thinks that any XS code that
1896 : would want this is probably operating too close to the internals. Presenting a
1897 : valid use case could change that.
1898 :
1899 : See also
1900 : C<L<perlapi/is_utf8_invariant_string>>
1901 : and
1902 : C<L<perlapi/is_utf8_invariant_string_loc>>,
1903 :
1904 : =cut
1905 :
1906 : */
1907 :
1908 : PERL_STATIC_INLINE Size_t
1909 : S_variant_under_utf8_count(const U8* const s, const U8* const e)
1910 : {
1911 : const U8* x = s;
1912 : Size_t count = 0;
1913 :
1914 : PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1915 :
1916 : # ifndef EBCDIC
1917 :
1918 : /* Test if the string is long enough to use word-at-a-time. (Logic is the
1919 : * same as for is_utf8_invariant_string()) */
1920 : if ((STRLEN) (e - x) >= PERL_WORDSIZE
1921 : + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1922 : - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1923 : {
1924 :
1925 : /* Process per-byte until reach word boundary. XXX This loop could be
1926 : * eliminated if we knew that this platform had fast unaligned reads */
1927 : while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1928 : count += ! UTF8_IS_INVARIANT(*x++);
1929 : }
1930 :
1931 : /* Process per-word as long as we have at least a full word left */
1932 : do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1933 : explanation of how this works */
1934 : PERL_UINTMAX_T increment
1935 : = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1936 : * PERL_COUNT_MULTIPLIER)
1937 : >> ((PERL_WORDSIZE - 1) * CHARBITS);
1938 : count += (Size_t) increment;
1939 : x += PERL_WORDSIZE;
1940 : } while (x + PERL_WORDSIZE <= e);
1941 : }
1942 :
1943 : # endif
1944 :
1945 : /* Process per-byte */
1946 : while (x < e) {
1947 : if (! UTF8_IS_INVARIANT(*x)) {
1948 : count++;
1949 : }
1950 :
1951 : x++;
1952 : }
1953 :
1954 : return count;
1955 : }
1956 :
1957 : #endif
1958 :
1959 : /* Keep these around for these files */
1960 : #if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C)
1961 : # undef PERL_WORDSIZE
1962 : # undef PERL_COUNT_MULTIPLIER
1963 : # undef PERL_WORD_BOUNDARY_MASK
1964 : # undef PERL_VARIANTS_WORD_MASK
1965 : #endif
1966 :
1967 : /*
1968 : =for apidoc is_utf8_string
1969 :
1970 : Returns TRUE if the first C<len> bytes of string C<s> form a valid
1971 : Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
1972 : be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1973 : can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1974 : byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1975 :
1976 : This function considers Perl's extended UTF-8 to be valid. That means that
1977 : code points above Unicode, surrogates, and non-character code points are
1978 : considered valid by this function. Use C<L</is_strict_utf8_string>>,
1979 : C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1980 : code points are considered valid.
1981 :
1982 : See also
1983 : C<L</is_utf8_invariant_string>>,
1984 : C<L</is_utf8_invariant_string_loc>>,
1985 : C<L</is_utf8_string_loc>>,
1986 : C<L</is_utf8_string_loclen>>,
1987 : C<L</is_utf8_fixed_width_buf_flags>>,
1988 : C<L</is_utf8_fixed_width_buf_loc_flags>>,
1989 : C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1990 :
1991 : =cut
1992 : */
1993 :
1994 : #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
1995 :
1996 : #if defined(PERL_CORE) || defined (PERL_EXT)
1997 :
1998 : /*
1999 : =for apidoc is_utf8_non_invariant_string
2000 :
2001 : Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
2002 : C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
2003 : UTF-8; otherwise returns FALSE.
2004 :
2005 : A TRUE return means that at least one code point represented by the sequence
2006 : either is a wide character not representable as a single byte, or the
2007 : representation differs depending on whether the sequence is encoded in UTF-8 or
2008 : not.
2009 :
2010 : See also
2011 : C<L<perlapi/is_utf8_invariant_string>>,
2012 : C<L<perlapi/is_utf8_string>>
2013 :
2014 : =cut
2015 :
2016 : This is commonly used to determine if a SV's UTF-8 flag should be turned on.
2017 : It generally needn't be if its string is entirely UTF-8 invariant, and it
2018 : shouldn't be if it otherwise contains invalid UTF-8.
2019 :
2020 : It is an internal function because khw thinks that XS code shouldn't be working
2021 : at this low a level. A valid use case could change that.
2022 :
2023 : */
2024 :
2025 : PERL_STATIC_INLINE bool
2026 : Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
2027 : {
2028 : const U8 * first_variant;
2029 :
2030 : PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
2031 :
2032 : if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2033 : return FALSE;
2034 : }
2035 :
2036 : return is_utf8_string(first_variant, len - (first_variant - s));
2037 : }
2038 :
2039 : #endif
2040 :
2041 : /*
2042 : =for apidoc is_strict_utf8_string
2043 :
2044 : Returns TRUE if the first C<len> bytes of string C<s> form a valid
2045 : UTF-8-encoded string that is fully interchangeable by any application using
2046 : Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
2047 : calculated using C<strlen(s)> (which means if you use this option, that C<s>
2048 : can't have embedded C<NUL> characters and has to have a terminating C<NUL>
2049 : byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
2050 :
2051 : This function returns FALSE for strings containing any
2052 : code points above the Unicode max of 0x10FFFF, surrogate code points, or
2053 : non-character code points.
2054 :
2055 : See also
2056 : C<L</is_utf8_invariant_string>>,
2057 : C<L</is_utf8_invariant_string_loc>>,
2058 : C<L</is_utf8_string>>,
2059 : C<L</is_utf8_string_flags>>,
2060 : C<L</is_utf8_string_loc>>,
2061 : C<L</is_utf8_string_loc_flags>>,
2062 : C<L</is_utf8_string_loclen>>,
2063 : C<L</is_utf8_string_loclen_flags>>,
2064 : C<L</is_utf8_fixed_width_buf_flags>>,
2065 : C<L</is_utf8_fixed_width_buf_loc_flags>>,
2066 : C<L</is_utf8_fixed_width_buf_loclen_flags>>,
2067 : C<L</is_strict_utf8_string_loc>>,
2068 : C<L</is_strict_utf8_string_loclen>>,
2069 : C<L</is_c9strict_utf8_string>>,
2070 : C<L</is_c9strict_utf8_string_loc>>,
2071 : and
2072 : C<L</is_c9strict_utf8_string_loclen>>.
2073 :
2074 : =cut
2075 : */
2076 :
2077 : #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
2078 :
2079 : /*
2080 : =for apidoc is_c9strict_utf8_string
2081 :
2082 : Returns TRUE if the first C<len> bytes of string C<s> form a valid
2083 : UTF-8-encoded string that conforms to
2084 : L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
2085 : otherwise it returns FALSE. If C<len> is 0, it will be calculated using
2086 : C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
2087 : C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
2088 : characters being ASCII constitute 'a valid UTF-8 string'.
2089 :
2090 : This function returns FALSE for strings containing any code points above the
2091 : Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
2092 : code points per
2093 : L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
2094 :
2095 : See also
2096 : C<L</is_utf8_invariant_string>>,
2097 : C<L</is_utf8_invariant_string_loc>>,
2098 : C<L</is_utf8_string>>,
2099 : C<L</is_utf8_string_flags>>,
2100 : C<L</is_utf8_string_loc>>,
2101 : C<L</is_utf8_string_loc_flags>>,
2102 : C<L</is_utf8_string_loclen>>,
2103 : C<L</is_utf8_string_loclen_flags>>,
2104 : C<L</is_utf8_fixed_width_buf_flags>>,
2105 : C<L</is_utf8_fixed_width_buf_loc_flags>>,
2106 : C<L</is_utf8_fixed_width_buf_loclen_flags>>,
2107 : C<L</is_strict_utf8_string>>,
2108 : C<L</is_strict_utf8_string_loc>>,
2109 : C<L</is_strict_utf8_string_loclen>>,
2110 : C<L</is_c9strict_utf8_string_loc>>,
2111 : and
2112 : C<L</is_c9strict_utf8_string_loclen>>.
2113 :
2114 : =cut
2115 : */
2116 :
2117 : #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
2118 :
2119 : /*
2120 : =for apidoc is_utf8_string_flags
2121 :
2122 : Returns TRUE if the first C<len> bytes of string C<s> form a valid
2123 : UTF-8 string, subject to the restrictions imposed by C<flags>;
2124 : returns FALSE otherwise. If C<len> is 0, it will be calculated
2125 : using C<strlen(s)> (which means if you use this option, that C<s> can't have
2126 : embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
2127 : that all characters being ASCII constitute 'a valid UTF-8 string'.
2128 :
2129 : If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
2130 : C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2131 : as C<L</is_strict_utf8_string>>; and if C<flags> is
2132 : C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
2133 : C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
2134 : combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
2135 : C<L</utf8n_to_uvchr>>, with the same meanings.
2136 :
2137 : See also
2138 : C<L</is_utf8_invariant_string>>,
2139 : C<L</is_utf8_invariant_string_loc>>,
2140 : C<L</is_utf8_string>>,
2141 : C<L</is_utf8_string_loc>>,
2142 : C<L</is_utf8_string_loc_flags>>,
2143 : C<L</is_utf8_string_loclen>>,
2144 : C<L</is_utf8_string_loclen_flags>>,
2145 : C<L</is_utf8_fixed_width_buf_flags>>,
2146 : C<L</is_utf8_fixed_width_buf_loc_flags>>,
2147 : C<L</is_utf8_fixed_width_buf_loclen_flags>>,
2148 : C<L</is_strict_utf8_string>>,
2149 : C<L</is_strict_utf8_string_loc>>,
2150 : C<L</is_strict_utf8_string_loclen>>,
2151 : C<L</is_c9strict_utf8_string>>,
2152 : C<L</is_c9strict_utf8_string_loc>>,
2153 : and
2154 : C<L</is_c9strict_utf8_string_loclen>>.
2155 :
2156 : =cut
2157 : */
2158 :
2159 : PERL_STATIC_INLINE bool
2160 : Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
2161 : {
2162 : const U8 * first_variant;
2163 :
2164 : PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
2165 : assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2166 : |UTF8_DISALLOW_PERL_EXTENDED)));
2167 :
2168 : if (len == 0) {
2169 : len = strlen((const char *)s);
2170 : }
2171 :
2172 : if (flags == 0) {
2173 : return is_utf8_string(s, len);
2174 : }
2175 :
2176 : if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2177 : == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
2178 : {
2179 : return is_strict_utf8_string(s, len);
2180 : }
2181 :
2182 : if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2183 : == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
2184 : {
2185 : return is_c9strict_utf8_string(s, len);
2186 : }
2187 :
2188 : if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
2189 : const U8* const send = s + len;
2190 : const U8* x = first_variant;
2191 :
2192 : while (x < send) {
2193 : STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
2194 : if (UNLIKELY(! cur_len)) {
2195 : return FALSE;
2196 : }
2197 : x += cur_len;
2198 : }
2199 : }
2200 :
2201 : return TRUE;
2202 : }
2203 :
2204 : /*
2205 :
2206 : =for apidoc is_utf8_string_loc
2207 :
2208 : Like C<L</is_utf8_string>> but stores the location of the failure (in the
2209 : case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2210 : "utf8ness success") in the C<ep> pointer.
2211 :
2212 : See also C<L</is_utf8_string_loclen>>.
2213 :
2214 : =cut
2215 : */
2216 :
2217 : #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
2218 :
2219 : /*
2220 :
2221 : =for apidoc is_utf8_string_loclen
2222 :
2223 : Like C<L</is_utf8_string>> but stores the location of the failure (in the
2224 : case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2225 : "utf8ness success") in the C<ep> pointer, and the number of UTF-8
2226 : encoded characters in the C<el> pointer.
2227 :
2228 : See also C<L</is_utf8_string_loc>>.
2229 :
2230 : =cut
2231 : */
2232 :
2233 : PERL_STATIC_INLINE bool
2234 : Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
2235 : {
2236 : const U8 * first_variant;
2237 :
2238 : PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
2239 :
2240 : if (len == 0) {
2241 : len = strlen((const char *) s);
2242 : }
2243 :
2244 : if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2245 : if (el)
2246 : *el = len;
2247 :
2248 : if (ep) {
2249 : *ep = s + len;
2250 : }
2251 :
2252 : return TRUE;
2253 : }
2254 :
2255 : {
2256 : const U8* const send = s + len;
2257 : const U8* x = first_variant;
2258 : STRLEN outlen = first_variant - s;
2259 :
2260 : while (x < send) {
2261 : const STRLEN cur_len = isUTF8_CHAR(x, send);
2262 : if (UNLIKELY(! cur_len)) {
2263 : break;
2264 : }
2265 : x += cur_len;
2266 : outlen++;
2267 : }
2268 :
2269 : if (el)
2270 : *el = outlen;
2271 :
2272 : if (ep) {
2273 : *ep = x;
2274 : }
2275 :
2276 : return (x == send);
2277 : }
2278 : }
2279 :
2280 : /* The perl core arranges to never call the DFA below without there being at
2281 : * least one byte available to look at. This allows the DFA to use a do {}
2282 : * while loop which means that calling it with a UTF-8 invariant has a single
2283 : * conditional, same as the calling code checking for invariance ahead of time.
2284 : * And having the calling code remove that conditional speeds up by that
2285 : * conditional, the case where it wasn't invariant. So there's no reason to
2286 : * check before caling this.
2287 : *
2288 : * But we don't know this for non-core calls, so have to retain the check for
2289 : * them. */
2290 : #ifdef PERL_CORE
2291 : # define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s))
2292 : #else
2293 : # define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE
2294 : #endif
2295 :
2296 : /*
2297 : * DFA for checking input is valid UTF-8 syntax.
2298 : *
2299 : * This uses adaptations of the table and algorithm given in
2300 : * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2301 : * documentation of the original version. A copyright notice for the original
2302 : * version is given at the beginning of this file. The Perl adaptations are
2303 : * documented at the definition of PL_extended_utf8_dfa_tab[].
2304 : *
2305 : * This dfa is fast. There are three exit conditions:
2306 : * 1) a well-formed code point, acceptable to the table
2307 : * 2) the beginning bytes of an incomplete character, whose completion might
2308 : * or might not be acceptable
2309 : * 3) unacceptable to the table. Some of the adaptations have certain,
2310 : * hopefully less likely to occur, legal inputs be unacceptable to the
2311 : * table, so these must be sorted out afterwards.
2312 : *
2313 : * This macro is a complete implementation of the code executing the DFA. It
2314 : * is passed the input sequence bounds and the table to use, and what to do
2315 : * for each of the exit conditions. There are three canned actions, likely to
2316 : * be the ones you want:
2317 : * DFA_RETURN_SUCCESS_
2318 : * DFA_RETURN_FAILURE_
2319 : * DFA_GOTO_TEASE_APART_FF_
2320 : *
2321 : * You pass a parameter giving the action to take for each of the three
2322 : * possible exit conditions:
2323 : *
2324 : * 'accept_action' This is executed when the DFA accepts the input.
2325 : * DFA_RETURN_SUCCESS_ is the most likely candidate.
2326 : * 'reject_action' This is executed when the DFA rejects the input.
2327 : * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
2328 : * you have written code to distinguish the rejecting state
2329 : * results. Because it happens in several places, and
2330 : * involves #ifdefs, the special action
2331 : * DFA_GOTO_TEASE_APART_FF_ is what you want with
2332 : * PL_extended_utf8_dfa_tab. On platforms without
2333 : * EXTRA_LONG_UTF8, there is no need to tease anything apart,
2334 : * so this evaluates to DFA_RETURN_FAILURE_; otherwise you
2335 : * need to have a label 'tease_apart_FF' that it will transfer
2336 : * to.
2337 : * 'incomplete_char_action' This is executed when the DFA ran off the end
2338 : * before accepting or rejecting the input.
2339 : * DFA_RETURN_FAILURE_ is the likely action, but you could
2340 : * have a 'goto', or NOOP. In the latter case the DFA drops
2341 : * off the end, and you place your code to handle this case
2342 : * immediately after it.
2343 : */
2344 :
2345 : #define DFA_RETURN_SUCCESS_ return s - s0
2346 : #define DFA_RETURN_FAILURE_ return 0
2347 : #ifdef HAS_EXTRA_LONG_UTF8
2348 : # define DFA_TEASE_APART_FF_ goto tease_apart_FF
2349 : #else
2350 : # define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_
2351 : #endif
2352 :
2353 : #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \
2354 : accept_action, \
2355 : reject_action, \
2356 : incomplete_char_action) \
2357 : STMT_START { \
2358 : const U8 * s = s0; \
2359 : const U8 * e_ = e; \
2360 : UV state = 0; \
2361 : \
2362 : PERL_NON_CORE_CHECK_EMPTY(s, e_); \
2363 : \
2364 : do { \
2365 : state = dfa_tab[256 + state + dfa_tab[*s]]; \
2366 : s++; \
2367 : \
2368 : if (state == 0) { /* Accepting state */ \
2369 : accept_action; \
2370 : } \
2371 : \
2372 : if (UNLIKELY(state == 1)) { /* Rejecting state */ \
2373 : reject_action; \
2374 : } \
2375 : } while (s < e_); \
2376 : \
2377 : /* Here, dropped out of loop before end-of-char */ \
2378 : incomplete_char_action; \
2379 : } STMT_END
2380 :
2381 :
2382 : /*
2383 :
2384 : =for apidoc isUTF8_CHAR
2385 :
2386 : Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2387 : looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2388 : that represents some code point; otherwise it evaluates to 0. If non-zero, the
2389 : value gives how many bytes starting at C<s> comprise the code point's
2390 : representation. Any bytes remaining before C<e>, but beyond the ones needed to
2391 : form the first code point in C<s>, are not examined.
2392 :
2393 : The code point can be any that will fit in an IV on this machine, using Perl's
2394 : extension to official UTF-8 to represent those higher than the Unicode maximum
2395 : of 0x10FFFF. That means that this macro is used to efficiently decide if the
2396 : next few bytes in C<s> is legal UTF-8 for a single character.
2397 :
2398 : Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
2399 : defined by Unicode to be fully interchangeable across applications;
2400 : C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
2401 : #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
2402 : code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
2403 :
2404 : Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
2405 : C<L</is_utf8_string_loclen>> to check entire strings.
2406 :
2407 : Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
2408 : machines) is a valid UTF-8 character.
2409 :
2410 : =cut
2411 :
2412 : This uses an adaptation of the table and algorithm given in
2413 : https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2414 : documentation of the original version. A copyright notice for the original
2415 : version is given at the beginning of this file. The Perl adaptation is
2416 : documented at the definition of PL_extended_utf8_dfa_tab[].
2417 : */
2418 :
2419 : PERL_STATIC_INLINE Size_t
2420 : Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
2421 : {
2422 : PERL_ARGS_ASSERT_ISUTF8_CHAR;
2423 :
2424 : PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2425 : DFA_RETURN_SUCCESS_,
2426 : DFA_TEASE_APART_FF_,
2427 : DFA_RETURN_FAILURE_);
2428 :
2429 : /* Here, we didn't return success, but dropped out of the loop. In the
2430 : * case of PL_extended_utf8_dfa_tab, this means the input is either
2431 : * malformed, or the start byte was FF on a platform that the dfa doesn't
2432 : * handle FF's. Call a helper function. */
2433 :
2434 : #ifdef HAS_EXTRA_LONG_UTF8
2435 :
2436 : tease_apart_FF:
2437 :
2438 : /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2439 : * either malformed, or was for the largest possible start byte, which we
2440 : * now check, not inline */
2441 : if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
2442 : return 0;
2443 : }
2444 :
2445 : return is_utf8_FF_helper_(s0, e,
2446 : FALSE /* require full, not partial char */
2447 : );
2448 : #endif
2449 :
2450 : }
2451 :
2452 : /*
2453 :
2454 : =for apidoc isSTRICT_UTF8_CHAR
2455 :
2456 : Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2457 : looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
2458 : Unicode code point completely acceptable for open interchange between all
2459 : applications; otherwise it evaluates to 0. If non-zero, the value gives how
2460 : many bytes starting at C<s> comprise the code point's representation. Any
2461 : bytes remaining before C<e>, but beyond the ones needed to form the first code
2462 : point in C<s>, are not examined.
2463 :
2464 : The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
2465 : be a surrogate nor a non-character code point. Thus this excludes any code
2466 : point from Perl's extended UTF-8.
2467 :
2468 : This is used to efficiently decide if the next few bytes in C<s> is
2469 : legal Unicode-acceptable UTF-8 for a single character.
2470 :
2471 : Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
2472 : #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
2473 : code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
2474 : and C<L</isUTF8_CHAR_flags>> for a more customized definition.
2475 :
2476 : Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
2477 : C<L</is_strict_utf8_string_loclen>> to check entire strings.
2478 :
2479 : =cut
2480 :
2481 : This uses an adaptation of the tables and algorithm given in
2482 : https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2483 : documentation of the original version. A copyright notice for the original
2484 : version is given at the beginning of this file. The Perl adaptation is
2485 : documented at the definition of strict_extended_utf8_dfa_tab[].
2486 :
2487 : */
2488 :
2489 : PERL_STATIC_INLINE Size_t
2490 : Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
2491 : {
2492 : PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
2493 :
2494 : PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
2495 : DFA_RETURN_SUCCESS_,
2496 : goto check_hanguls,
2497 : DFA_RETURN_FAILURE_);
2498 : check_hanguls:
2499 :
2500 : /* Here, we didn't return success, but dropped out of the loop. In the
2501 : * case of PL_strict_utf8_dfa_tab, this means the input is either
2502 : * malformed, or was for certain Hanguls; handle them specially */
2503 :
2504 : /* The dfa above drops out for incomplete or illegal inputs, and certain
2505 : * legal Hanguls; check and return accordingly */
2506 : return is_HANGUL_ED_utf8_safe(s0, e);
2507 : }
2508 :
2509 : /*
2510 :
2511 : =for apidoc isC9_STRICT_UTF8_CHAR
2512 :
2513 : Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2514 : looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
2515 : Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
2516 : the value gives how many bytes starting at C<s> comprise the code point's
2517 : representation. Any bytes remaining before C<e>, but beyond the ones needed to
2518 : form the first code point in C<s>, are not examined.
2519 :
2520 : The largest acceptable code point is the Unicode maximum 0x10FFFF. This
2521 : differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
2522 : code points. This corresponds to
2523 : L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
2524 : which said that non-character code points are merely discouraged rather than
2525 : completely forbidden in open interchange. See
2526 : L<perlunicode/Noncharacter code points>.
2527 :
2528 : Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
2529 : C<L</isUTF8_CHAR_flags>> for a more customized definition.
2530 :
2531 : Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
2532 : C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
2533 :
2534 : =cut
2535 :
2536 : This uses an adaptation of the tables and algorithm given in
2537 : https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2538 : documentation of the original version. A copyright notice for the original
2539 : version is given at the beginning of this file. The Perl adaptation is
2540 : documented at the definition of PL_c9_utf8_dfa_tab[].
2541 :
2542 : */
2543 :
2544 : PERL_STATIC_INLINE Size_t
2545 : Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
2546 : {
2547 : PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
2548 :
2549 : PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
2550 : DFA_RETURN_SUCCESS_,
2551 : DFA_RETURN_FAILURE_,
2552 : DFA_RETURN_FAILURE_);
2553 : }
2554 :
2555 : /*
2556 :
2557 : =for apidoc is_strict_utf8_string_loc
2558 :
2559 : Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
2560 : case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2561 : "utf8ness success") in the C<ep> pointer.
2562 :
2563 : See also C<L</is_strict_utf8_string_loclen>>.
2564 :
2565 : =cut
2566 : */
2567 :
2568 : #define is_strict_utf8_string_loc(s, len, ep) \
2569 : is_strict_utf8_string_loclen(s, len, ep, 0)
2570 :
2571 : /*
2572 :
2573 : =for apidoc is_strict_utf8_string_loclen
2574 :
2575 : Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
2576 : case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2577 : "utf8ness success") in the C<ep> pointer, and the number of UTF-8
2578 : encoded characters in the C<el> pointer.
2579 :
2580 : See also C<L</is_strict_utf8_string_loc>>.
2581 :
2582 : =cut
2583 : */
2584 :
2585 : PERL_STATIC_INLINE bool
2586 : Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
2587 : {
2588 : const U8 * first_variant;
2589 :
2590 : PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
2591 :
2592 : if (len == 0) {
2593 : len = strlen((const char *) s);
2594 : }
2595 :
2596 : if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2597 : if (el)
2598 : *el = len;
2599 :
2600 : if (ep) {
2601 : *ep = s + len;
2602 : }
2603 :
2604 : return TRUE;
2605 : }
2606 :
2607 : {
2608 : const U8* const send = s + len;
2609 : const U8* x = first_variant;
2610 : STRLEN outlen = first_variant - s;
2611 :
2612 : while (x < send) {
2613 : const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
2614 : if (UNLIKELY(! cur_len)) {
2615 : break;
2616 : }
2617 : x += cur_len;
2618 : outlen++;
2619 : }
2620 :
2621 : if (el)
2622 : *el = outlen;
2623 :
2624 : if (ep) {
2625 : *ep = x;
2626 : }
2627 :
2628 : return (x == send);
2629 : }
2630 : }
2631 :
2632 : /*
2633 :
2634 : =for apidoc is_c9strict_utf8_string_loc
2635 :
2636 : Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
2637 : the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2638 : "utf8ness success") in the C<ep> pointer.
2639 :
2640 : See also C<L</is_c9strict_utf8_string_loclen>>.
2641 :
2642 : =cut
2643 : */
2644 :
2645 : #define is_c9strict_utf8_string_loc(s, len, ep) \
2646 : is_c9strict_utf8_string_loclen(s, len, ep, 0)
2647 :
2648 : /*
2649 :
2650 : =for apidoc is_c9strict_utf8_string_loclen
2651 :
2652 : Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
2653 : the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2654 : "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
2655 : characters in the C<el> pointer.
2656 :
2657 : See also C<L</is_c9strict_utf8_string_loc>>.
2658 :
2659 : =cut
2660 : */
2661 :
2662 : PERL_STATIC_INLINE bool
2663 : Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
2664 : {
2665 : const U8 * first_variant;
2666 :
2667 : PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
2668 :
2669 : if (len == 0) {
2670 : len = strlen((const char *) s);
2671 : }
2672 :
2673 : if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2674 : if (el)
2675 : *el = len;
2676 :
2677 : if (ep) {
2678 : *ep = s + len;
2679 : }
2680 :
2681 : return TRUE;
2682 : }
2683 :
2684 : {
2685 : const U8* const send = s + len;
2686 : const U8* x = first_variant;
2687 : STRLEN outlen = first_variant - s;
2688 :
2689 : while (x < send) {
2690 : const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
2691 : if (UNLIKELY(! cur_len)) {
2692 : break;
2693 : }
2694 : x += cur_len;
2695 : outlen++;
2696 : }
2697 :
2698 : if (el)
2699 : *el = outlen;
2700 :
2701 : if (ep) {
2702 : *ep = x;
2703 : }
2704 :
2705 : return (x == send);
2706 : }
2707 : }
2708 :
2709 : /*
2710 :
2711 : =for apidoc is_utf8_string_loc_flags
2712 :
2713 : Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
2714 : case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2715 : "utf8ness success") in the C<ep> pointer.
2716 :
2717 : See also C<L</is_utf8_string_loclen_flags>>.
2718 :
2719 : =cut
2720 : */
2721 :
2722 : #define is_utf8_string_loc_flags(s, len, ep, flags) \
2723 : is_utf8_string_loclen_flags(s, len, ep, 0, flags)
2724 :
2725 :
2726 : /* The above 3 actual functions could have been moved into the more general one
2727 : * just below, and made #defines that call it with the right 'flags'. They are
2728 : * currently kept separate to increase their chances of getting inlined */
2729 :
2730 : /*
2731 :
2732 : =for apidoc is_utf8_string_loclen_flags
2733 :
2734 : Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
2735 : case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2736 : "utf8ness success") in the C<ep> pointer, and the number of UTF-8
2737 : encoded characters in the C<el> pointer.
2738 :
2739 : See also C<L</is_utf8_string_loc_flags>>.
2740 :
2741 : =cut
2742 : */
2743 :
2744 : PERL_STATIC_INLINE bool
2745 : Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
2746 : {
2747 : const U8 * first_variant;
2748 :
2749 : PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
2750 : assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2751 : |UTF8_DISALLOW_PERL_EXTENDED)));
2752 :
2753 : if (flags == 0) {
2754 : return is_utf8_string_loclen(s, len, ep, el);
2755 : }
2756 :
2757 : if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2758 : == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
2759 : {
2760 : return is_strict_utf8_string_loclen(s, len, ep, el);
2761 : }
2762 :
2763 : if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2764 : == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
2765 : {
2766 : return is_c9strict_utf8_string_loclen(s, len, ep, el);
2767 : }
2768 :
2769 : if (len == 0) {
2770 : len = strlen((const char *) s);
2771 : }
2772 :
2773 : if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2774 : if (el)
2775 : *el = len;
2776 :
2777 : if (ep) {
2778 : *ep = s + len;
2779 : }
2780 :
2781 : return TRUE;
2782 : }
2783 :
2784 : {
2785 : const U8* send = s + len;
2786 : const U8* x = first_variant;
2787 : STRLEN outlen = first_variant - s;
2788 :
2789 : while (x < send) {
2790 : const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
2791 : if (UNLIKELY(! cur_len)) {
2792 : break;
2793 : }
2794 : x += cur_len;
2795 : outlen++;
2796 : }
2797 :
2798 : if (el)
2799 : *el = outlen;
2800 :
2801 : if (ep) {
2802 : *ep = x;
2803 : }
2804 :
2805 : return (x == send);
2806 : }
2807 : }
2808 :
2809 : /*
2810 : =for apidoc utf8_distance
2811 :
2812 : Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
2813 : and C<b>.
2814 :
2815 : WARNING: use only if you *know* that the pointers point inside the
2816 : same UTF-8 buffer.
2817 :
2818 : =cut
2819 : */
2820 :
2821 : PERL_STATIC_INLINE IV
2822 : Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
2823 : {
2824 : PERL_ARGS_ASSERT_UTF8_DISTANCE;
2825 :
2826 : return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2827 : }
2828 :
2829 : /*
2830 : =for apidoc utf8_hop
2831 :
2832 : Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2833 : forward (if C<off> is positive) or backward (if negative). C<s> does not need
2834 : to be pointing to the starting byte of a character. If it isn't, one count of
2835 : C<off> will be used up to get to the start of the next character for forward
2836 : hops, and to the start of the current character for negative ones.
2837 :
2838 : WARNING: Prefer L</utf8_hop_safe> to this one.
2839 :
2840 : Do NOT use this function unless you B<know> C<off> is within
2841 : the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned
2842 : on the first byte of a character or just after the last byte of a character.
2843 :
2844 : =cut
2845 : */
2846 :
2847 : PERL_STATIC_INLINE U8 *
2848 : Perl_utf8_hop(const U8 *s, SSize_t off)
2849 : {
2850 : PERL_ARGS_ASSERT_UTF8_HOP;
2851 :
2852 : /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2853 : * the XXX bitops (especially ~) can create illegal UTF-8.
2854 : * In other words: in Perl UTF-8 is not just for Unicode. */
2855 :
2856 : if (off > 0) {
2857 :
2858 : /* Get to next non-continuation byte */
2859 : if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2860 : do {
2861 : s++;
2862 : }
2863 : while (UTF8_IS_CONTINUATION(*s));
2864 : off--;
2865 : }
2866 :
2867 : while (off--)
2868 : s += UTF8SKIP(s);
2869 : }
2870 : else {
2871 : while (off++) {
2872 : s--;
2873 : while (UTF8_IS_CONTINUATION(*s))
2874 : s--;
2875 : }
2876 : }
2877 :
2878 : GCC_DIAG_IGNORE(-Wcast-qual)
2879 : return (U8 *)s;
2880 : GCC_DIAG_RESTORE
2881 : }
2882 :
2883 : /*
2884 : =for apidoc utf8_hop_forward
2885 :
2886 : Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2887 : forward. C<s> does not need to be pointing to the starting byte of a
2888 : character. If it isn't, one count of C<off> will be used up to get to the
2889 : start of the next character.
2890 :
2891 : C<off> must be non-negative.
2892 :
2893 : C<s> must be before or equal to C<end>.
2894 :
2895 : When moving forward it will not move beyond C<end>.
2896 :
2897 : Will not exceed this limit even if the string is not valid "UTF-8".
2898 :
2899 : =cut
2900 : */
2901 :
2902 : PERL_STATIC_INLINE U8 *
2903 : Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2904 : {
2905 : PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2906 :
2907 : /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2908 : * the bitops (especially ~) can create illegal UTF-8.
2909 : * In other words: in Perl UTF-8 is not just for Unicode. */
2910 :
2911 : assert(s <= end);
2912 : assert(off >= 0);
2913 :
2914 : if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2915 : /* Get to next non-continuation byte */
2916 : do {
2917 : s++;
2918 : }
2919 : while (UTF8_IS_CONTINUATION(*s));
2920 : off--;
2921 : }
2922 :
2923 : while (off--) {
2924 : STRLEN skip = UTF8SKIP(s);
2925 : if ((STRLEN)(end - s) <= skip) {
2926 : GCC_DIAG_IGNORE(-Wcast-qual)
2927 : return (U8 *)end;
2928 : GCC_DIAG_RESTORE
2929 : }
2930 : s += skip;
2931 : }
2932 :
2933 : GCC_DIAG_IGNORE(-Wcast-qual)
2934 : return (U8 *)s;
2935 : GCC_DIAG_RESTORE
2936 : }
2937 :
2938 : /*
2939 : =for apidoc utf8_hop_back
2940 :
2941 : Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2942 : backward. C<s> does not need to be pointing to the starting byte of a
2943 : character. If it isn't, one count of C<off> will be used up to get to that
2944 : start.
2945 :
2946 : C<off> must be non-positive.
2947 :
2948 : C<s> must be after or equal to C<start>.
2949 :
2950 : When moving backward it will not move before C<start>.
2951 :
2952 : Will not exceed this limit even if the string is not valid "UTF-8".
2953 :
2954 : =cut
2955 : */
2956 :
2957 : PERL_STATIC_INLINE U8 *
2958 : Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2959 : {
2960 : PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2961 :
2962 : /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2963 : * the bitops (especially ~) can create illegal UTF-8.
2964 : * In other words: in Perl UTF-8 is not just for Unicode. */
2965 :
2966 : assert(start <= s);
2967 : assert(off <= 0);
2968 :
2969 : /* Note: if we know that the input is well-formed, we can do per-word
2970 : * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
2971 : * that. But it was reverted because doing per-word has some
2972 : * start-up/tear-down overhead, so only makes sense if the distance to be
2973 : * moved is large, and core perl doesn't currently move more than a few
2974 : * characters at a time. You can reinstate it if it does become
2975 : * advantageous. */
2976 : while (off++ && s > start) {
2977 : do {
2978 : s--;
2979 : } while (s > start && UTF8_IS_CONTINUATION(*s));
2980 : }
2981 :
2982 : GCC_DIAG_IGNORE(-Wcast-qual)
2983 : return (U8 *)s;
2984 : GCC_DIAG_RESTORE
2985 : }
2986 :
2987 : /*
2988 : =for apidoc utf8_hop_safe
2989 :
2990 : Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2991 : either forward or backward. C<s> does not need to be pointing to the starting
2992 : byte of a character. If it isn't, one count of C<off> will be used up to get
2993 : to the start of the next character for forward hops, and to the start of the
2994 : current character for negative ones.
2995 :
2996 : When moving backward it will not move before C<start>.
2997 :
2998 : When moving forward it will not move beyond C<end>.
2999 :
3000 : Will not exceed those limits even if the string is not valid "UTF-8".
3001 :
3002 : =cut
3003 : */
3004 :
3005 : PERL_STATIC_INLINE U8 *
3006 : Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
3007 : {
3008 : PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
3009 :
3010 : /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
3011 : * the bitops (especially ~) can create illegal UTF-8.
3012 : * In other words: in Perl UTF-8 is not just for Unicode. */
3013 :
3014 : assert(start <= s && s <= end);
3015 :
3016 : if (off >= 0) {
3017 : return utf8_hop_forward(s, off, end);
3018 : }
3019 : else {
3020 : return utf8_hop_back(s, off, start);
3021 : }
3022 : }
3023 :
3024 : /*
3025 :
3026 : =for apidoc isUTF8_CHAR_flags
3027 :
3028 : Evaluates to non-zero if the first few bytes of the string starting at C<s> and
3029 : looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
3030 : that represents some code point, subject to the restrictions given by C<flags>;
3031 : otherwise it evaluates to 0. If non-zero, the value gives how many bytes
3032 : starting at C<s> comprise the code point's representation. Any bytes remaining
3033 : before C<e>, but beyond the ones needed to form the first code point in C<s>,
3034 : are not examined.
3035 :
3036 : If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
3037 : if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
3038 : as C<L</isSTRICT_UTF8_CHAR>>;
3039 : and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
3040 : the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
3041 : Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
3042 : understood by C<L</utf8n_to_uvchr>>, with the same meanings.
3043 :
3044 : The three alternative macros are for the most commonly needed validations; they
3045 : are likely to run somewhat faster than this more general one, as they can be
3046 : inlined into your code.
3047 :
3048 : Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
3049 : L</is_utf8_string_loclen_flags> to check entire strings.
3050 :
3051 : =cut
3052 : */
3053 :
3054 : PERL_STATIC_INLINE STRLEN
3055 : Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
3056 : {
3057 : PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
3058 : assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
3059 : |UTF8_DISALLOW_PERL_EXTENDED)));
3060 :
3061 : PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
3062 : goto check_success,
3063 : DFA_TEASE_APART_FF_,
3064 : DFA_RETURN_FAILURE_);
3065 :
3066 : check_success:
3067 :
3068 : return is_utf8_char_helper_(s0, e, flags);
3069 :
3070 : #ifdef HAS_EXTRA_LONG_UTF8
3071 :
3072 : tease_apart_FF:
3073 :
3074 : /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
3075 : * either malformed, or was for the largest possible start byte, which
3076 : * indicates perl extended UTF-8, well above the Unicode maximum */
3077 : if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
3078 : || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
3079 : {
3080 : return 0;
3081 : }
3082 :
3083 : /* Otherwise examine the sequence not inline */
3084 : return is_utf8_FF_helper_(s0, e,
3085 : FALSE /* require full, not partial char */
3086 : );
3087 : #endif
3088 :
3089 : }
3090 :
3091 : /*
3092 :
3093 : =for apidoc is_utf8_valid_partial_char
3094 :
3095 : Returns 0 if the sequence of bytes starting at C<s> and looking no further than
3096 : S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
3097 : points. Otherwise, it returns 1 if there exists at least one non-empty
3098 : sequence of bytes that when appended to sequence C<s>, starting at position
3099 : C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
3100 : otherwise returns 0.
3101 :
3102 : In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
3103 : point.
3104 :
3105 : This is useful when a fixed-length buffer is being tested for being well-formed
3106 : UTF-8, but the final few bytes in it don't comprise a full character; that is,
3107 : it is split somewhere in the middle of the final code point's UTF-8
3108 : representation. (Presumably when the buffer is refreshed with the next chunk
3109 : of data, the new first bytes will complete the partial code point.) This
3110 : function is used to verify that the final bytes in the current buffer are in
3111 : fact the legal beginning of some code point, so that if they aren't, the
3112 : failure can be signalled without having to wait for the next read.
3113 :
3114 : =cut
3115 : */
3116 : #define is_utf8_valid_partial_char(s, e) \
3117 : is_utf8_valid_partial_char_flags(s, e, 0)
3118 :
3119 : /*
3120 :
3121 : =for apidoc is_utf8_valid_partial_char_flags
3122 :
3123 : Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
3124 : or not the input is a valid UTF-8 encoded partial character, but it takes an
3125 : extra parameter, C<flags>, which can further restrict which code points are
3126 : considered valid.
3127 :
3128 : If C<flags> is 0, this behaves identically to
3129 : C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
3130 : of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
3131 : there is any sequence of bytes that can complete the input partial character in
3132 : such a way that a non-prohibited character is formed, the function returns
3133 : TRUE; otherwise FALSE. Non character code points cannot be determined based on
3134 : partial character input. But many of the other possible excluded types can be
3135 : determined from just the first one or two bytes.
3136 :
3137 : =cut
3138 : */
3139 :
3140 : PERL_STATIC_INLINE bool
3141 : Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
3142 : {
3143 : PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
3144 : assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
3145 : |UTF8_DISALLOW_PERL_EXTENDED)));
3146 :
3147 : PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
3148 : DFA_RETURN_FAILURE_,
3149 : DFA_TEASE_APART_FF_,
3150 : NOOP);
3151 :
3152 : /* The NOOP above causes the DFA to drop down here iff the input was a
3153 : * partial character. flags=0 => can return TRUE immediately; otherwise we
3154 : * need to check (not inline) if the partial character is the beginning of
3155 : * a disallowed one */
3156 : if (flags == 0) {
3157 : return TRUE;
3158 : }
3159 :
3160 : return cBOOL(is_utf8_char_helper_(s0, e, flags));
3161 :
3162 : #ifdef HAS_EXTRA_LONG_UTF8
3163 :
3164 : tease_apart_FF:
3165 :
3166 : /* Getting here means the input is either malformed, or, in the case of
3167 : * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
3168 : * latter case has to be extended UTF-8, so can fail immediately if that is
3169 : * forbidden */
3170 :
3171 : if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
3172 : || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
3173 : {
3174 : return 0;
3175 : }
3176 :
3177 : return is_utf8_FF_helper_(s0, e,
3178 : TRUE /* Require to be a partial character */
3179 : );
3180 : #endif
3181 :
3182 : }
3183 :
3184 : /*
3185 :
3186 : =for apidoc is_utf8_fixed_width_buf_flags
3187 :
3188 : Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
3189 : is entirely valid UTF-8, subject to the restrictions given by C<flags>;
3190 : otherwise it returns FALSE.
3191 :
3192 : If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
3193 : without restriction. If the final few bytes of the buffer do not form a
3194 : complete code point, this will return TRUE anyway, provided that
3195 : C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
3196 :
3197 : If C<flags> in non-zero, it can be any combination of the
3198 : C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
3199 : same meanings.
3200 :
3201 : This function differs from C<L</is_utf8_string_flags>> only in that the latter
3202 : returns FALSE if the final few bytes of the string don't form a complete code
3203 : point.
3204 :
3205 : =cut
3206 : */
3207 : #define is_utf8_fixed_width_buf_flags(s, len, flags) \
3208 : is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
3209 :
3210 : /*
3211 :
3212 : =for apidoc is_utf8_fixed_width_buf_loc_flags
3213 :
3214 : Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
3215 : failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
3216 : to the beginning of any partial character at the end of the buffer; if there is
3217 : no partial character C<*ep> will contain C<s>+C<len>.
3218 :
3219 : See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
3220 :
3221 : =cut
3222 : */
3223 :
3224 : #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
3225 : is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
3226 :
3227 : /*
3228 :
3229 : =for apidoc is_utf8_fixed_width_buf_loclen_flags
3230 :
3231 : Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
3232 : complete, valid characters found in the C<el> pointer.
3233 :
3234 : =cut
3235 : */
3236 :
3237 : PERL_STATIC_INLINE bool
3238 : Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
3239 : STRLEN len,
3240 : const U8 **ep,
3241 : STRLEN *el,
3242 : const U32 flags)
3243 : {
3244 : const U8 * maybe_partial;
3245 :
3246 : PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
3247 :
3248 : if (! ep) {
3249 : ep = &maybe_partial;
3250 : }
3251 :
3252 : /* If it's entirely valid, return that; otherwise see if the only error is
3253 : * that the final few bytes are for a partial character */
3254 : return is_utf8_string_loclen_flags(s, len, ep, el, flags)
3255 : || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
3256 : }
3257 :
3258 : PERL_STATIC_INLINE UV
3259 : Perl_utf8n_to_uvchr_msgs(const U8 *s,
3260 : STRLEN curlen,
3261 : STRLEN *retlen,
3262 : const U32 flags,
3263 : U32 * errors,
3264 : AV ** msgs)
3265 : {
3266 : /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
3267 : * simple cases, and, if necessary calls a helper function to deal with the
3268 : * more complex ones. Almost all well-formed non-problematic code points
3269 : * are considered simple, so that it's unlikely that the helper function
3270 : * will need to be called.
3271 : *
3272 : * This is an adaptation of the tables and algorithm given in
3273 : * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
3274 : * comprehensive documentation of the original version. A copyright notice
3275 : * for the original version is given at the beginning of this file. The
3276 : * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[].
3277 : */
3278 :
3279 : const U8 * const s0 = s;
3280 : const U8 * send = s0 + curlen;
3281 : UV type;
3282 : UV uv;
3283 :
3284 : PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
3285 :
3286 : /* This dfa is fast. If it accepts the input, it was for a well-formed,
3287 : * non-problematic code point, which can be returned immediately.
3288 : * Otherwise we call a helper function to figure out the more complicated
3289 : * cases. */
3290 :
3291 : /* No calls from core pass in an empty string; non-core need a check */
3292 : #ifdef PERL_CORE
3293 : assert(curlen > 0);
3294 : #else
3295 : if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
3296 : flags, errors, msgs);
3297 : #endif
3298 :
3299 : type = PL_strict_utf8_dfa_tab[*s];
3300 :
3301 : /* The table is structured so that 'type' is 0 iff the input byte is
3302 : * represented identically regardless of the UTF-8ness of the string */
3303 : if (type == 0) { /* UTF-8 invariants are returned unchanged */
3304 : uv = *s;
3305 : }
3306 : else {
3307 : UV state = PL_strict_utf8_dfa_tab[256 + type];
3308 : uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
3309 :
3310 : while (LIKELY(state != 1) && ++s < send) {
3311 : type = PL_strict_utf8_dfa_tab[*s];
3312 : state = PL_strict_utf8_dfa_tab[256 + state + type];
3313 :
3314 : uv = UTF8_ACCUMULATE(uv, *s);
3315 :
3316 : if (state == 0) {
3317 : #ifdef EBCDIC
3318 : uv = UNI_TO_NATIVE(uv);
3319 : #endif
3320 : goto success;
3321 : }
3322 : }
3323 :
3324 : /* Here is potentially problematic. Use the full mechanism */
3325 : return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
3326 : errors, msgs);
3327 : }
3328 :
3329 : success:
3330 : if (retlen) {
3331 : *retlen = s - s0 + 1;
3332 : }
3333 : if (errors) {
3334 : *errors = 0;
3335 : }
3336 : if (msgs) {
3337 : *msgs = NULL;
3338 : }
3339 :
3340 : return uv;
3341 : }
3342 :
3343 : PERL_STATIC_INLINE UV
3344 : Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
3345 : {
3346 : PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
3347 :
3348 : assert(s < send);
3349 :
3350 : if (! ckWARN_d(WARN_UTF8)) {
3351 :
3352 : /* EMPTY is not really allowed, and asserts on debugging builds. But
3353 : * on non-debugging we have to deal with it, and this causes it to
3354 : * return the REPLACEMENT CHARACTER, as the documentation indicates */
3355 : return utf8n_to_uvchr(s, send - s, retlen,
3356 : (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
3357 : }
3358 : else {
3359 : UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
3360 : if (retlen && ret == 0 && (send <= s || *s != '\0')) {
3361 : *retlen = (STRLEN) -1;
3362 : }
3363 :
3364 : return ret;
3365 : }
3366 : }
3367 :
3368 : /* ------------------------------- perl.h ----------------------------- */
3369 :
3370 : /*
3371 : =for apidoc_section $utility
3372 :
3373 : =for apidoc is_safe_syscall
3374 :
3375 : Test that the given C<pv> (with length C<len>) doesn't contain any internal
3376 : C<NUL> characters.
3377 : If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
3378 : category, and return FALSE.
3379 :
3380 : Return TRUE if the name is safe.
3381 :
3382 : C<what> and C<op_name> are used in any warning.
3383 :
3384 : Used by the C<IS_SAFE_SYSCALL()> macro.
3385 :
3386 : =cut
3387 : */
3388 :
3389 : PERL_STATIC_INLINE bool
3390 : Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
3391 : {
3392 : /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
3393 : * perl itself uses xce*() functions which accept 8-bit strings.
3394 : */
3395 :
3396 : PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
3397 :
3398 : if (len > 1) {
3399 : char *null_at;
3400 : if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
3401 : SETERRNO(ENOENT, LIB_INVARG);
3402 : Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
3403 : "Invalid \\0 character in %s for %s: %s\\0%s",
3404 : what, op_name, pv, null_at+1);
3405 : return FALSE;
3406 : }
3407 : }
3408 :
3409 : return TRUE;
3410 : }
3411 :
3412 : /*
3413 :
3414 : Return true if the supplied filename has a newline character
3415 : immediately before the first (hopefully only) NUL.
3416 :
3417 : My original look at this incorrectly used the len from SvPV(), but
3418 : that's incorrect, since we allow for a NUL in pv[len-1].
3419 :
3420 : So instead, strlen() and work from there.
3421 :
3422 : This allow for the user reading a filename, forgetting to chomp it,
3423 : then calling:
3424 :
3425 : open my $foo, "$file\0";
3426 :
3427 : */
3428 :
3429 : #ifdef PERL_CORE
3430 :
3431 : PERL_STATIC_INLINE bool
3432 : S_should_warn_nl(const char *pv)
3433 : {
3434 : STRLEN len;
3435 :
3436 : PERL_ARGS_ASSERT_SHOULD_WARN_NL;
3437 :
3438 : len = strlen(pv);
3439 :
3440 : return len > 0 && pv[len-1] == '\n';
3441 : }
3442 :
3443 : #endif
3444 :
3445 : #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
3446 :
3447 : PERL_STATIC_INLINE bool
3448 : S_lossless_NV_to_IV(const NV nv, IV *ivp)
3449 : {
3450 : /* This function determines if the input NV 'nv' may be converted without
3451 : * loss of data to an IV. If not, it returns FALSE taking no other action.
3452 : * But if it is possible, it does the conversion, returning TRUE, and
3453 : * storing the converted result in '*ivp' */
3454 :
3455 : PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
3456 :
3457 : # if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
3458 : /* Normally any comparison with a NaN returns false; if we can't rely
3459 : * on that behaviour, check explicitly */
3460 : if (UNLIKELY(Perl_isnan(nv))) {
3461 : return FALSE;
3462 : }
3463 : # endif
3464 :
3465 : # ifndef NV_PRESERVES_UV
3466 : STATIC_ASSERT_STMT(((UV)1 << NV_PRESERVES_UV_BITS) - 1 <= (UV)IV_MAX);
3467 : # endif
3468 :
3469 : /* Written this way so that with an always-false NaN comparison we
3470 : * return false */
3471 : if (
3472 : # ifdef NV_PRESERVES_UV
3473 : LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1) &&
3474 : # else
3475 : /* If the condition below is not satisfied, lower bits of nv's
3476 : * integral part is already lost and accurate conversion to integer
3477 : * is impossible.
3478 : * Note this should be consistent with S_sv_2iuv_common in sv.c. */
3479 : Perl_fabs(nv) < (NV) ((UV)1 << NV_PRESERVES_UV_BITS) &&
3480 : # endif
3481 : (IV) nv == nv) {
3482 : *ivp = (IV) nv;
3483 : return TRUE;
3484 : }
3485 : return FALSE;
3486 : }
3487 :
3488 : #endif
3489 :
3490 : /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
3491 :
3492 : #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
3493 :
3494 : #define MAX_CHARSET_NAME_LENGTH 2
3495 :
3496 : PERL_STATIC_INLINE const char *
3497 : S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
3498 : {
3499 : PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
3500 :
3501 : /* Returns a string that corresponds to the name of the regex character set
3502 : * given by 'flags', and *lenp is set the length of that string, which
3503 : * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
3504 :
3505 : *lenp = 1;
3506 : switch (get_regex_charset(flags)) {
3507 : case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
3508 : case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
3509 : case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
3510 : case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
3511 : case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3512 : *lenp = 2;
3513 : return ASCII_MORE_RESTRICT_PAT_MODS;
3514 : }
3515 : /* The NOT_REACHED; hides an assert() which has a rather complex
3516 : * definition in perl.h. */
3517 : NOT_REACHED; /* NOTREACHED */
3518 : return "?"; /* Unknown */
3519 : }
3520 :
3521 : #endif
3522 :
3523 : /*
3524 :
3525 : Return false if any get magic is on the SV other than taint magic.
3526 :
3527 : */
3528 :
3529 : PERL_STATIC_INLINE bool
3530 : Perl_sv_only_taint_gmagic(SV *sv)
3531 : {
3532 : MAGIC *mg = SvMAGIC(sv);
3533 :
3534 : PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
3535 :
3536 : while (mg) {
3537 : if (mg->mg_type != PERL_MAGIC_taint
3538 : && !(mg->mg_flags & MGf_GSKIP)
3539 : && mg->mg_virtual->svt_get) {
3540 : return FALSE;
3541 : }
3542 : mg = mg->mg_moremagic;
3543 : }
3544 :
3545 : return TRUE;
3546 : }
3547 :
3548 : /* ------------------ cop.h ------------------------------------------- */
3549 :
3550 : /* implement GIMME_V() macro */
3551 :
3552 : PERL_STATIC_INLINE U8
3553 : Perl_gimme_V(pTHX)
3554 : {
3555 : I32 cxix;
3556 : U8 gimme = (PL_op->op_flags & OPf_WANT);
3557 :
3558 : if (gimme)
3559 : return gimme;
3560 : cxix = PL_curstackinfo->si_cxsubix;
3561 : if (cxix < 0)
3562 : return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
3563 : assert(cxstack[cxix].blk_gimme & G_WANT);
3564 : return (cxstack[cxix].blk_gimme & G_WANT);
3565 : }
3566 :
3567 :
3568 : /* Enter a block. Push a new base context and return its address. */
3569 :
3570 : PERL_STATIC_INLINE PERL_CONTEXT *
3571 : Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
3572 : {
3573 : PERL_CONTEXT * cx;
3574 :
3575 : PERL_ARGS_ASSERT_CX_PUSHBLOCK;
3576 :
3577 : CXINC;
3578 : cx = CX_CUR();
3579 : cx->cx_type = type;
3580 : cx->blk_gimme = gimme;
3581 : cx->blk_oldsaveix = saveix;
3582 : cx->blk_oldsp = (Stack_off_t)(sp - PL_stack_base);
3583 : assert(cxstack_ix <= 0
3584 : || CxTYPE(cx-1) == CXt_SUBST
3585 : || cx->blk_oldsp >= (cx-1)->blk_oldsp);
3586 : cx->blk_oldcop = PL_curcop;
3587 : cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
3588 : cx->blk_oldscopesp = PL_scopestack_ix;
3589 : cx->blk_oldpm = PL_curpm;
3590 : cx->blk_old_tmpsfloor = PL_tmps_floor;
3591 :
3592 : PL_tmps_floor = PL_tmps_ix;
3593 : CX_DEBUG(cx, "PUSH");
3594 : return cx;
3595 : }
3596 :
3597 :
3598 : /* Exit a block (RETURN and LAST). */
3599 :
3600 : PERL_STATIC_INLINE void
3601 : Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
3602 : {
3603 : PERL_ARGS_ASSERT_CX_POPBLOCK;
3604 :
3605 : CX_DEBUG(cx, "POP");
3606 : /* these 3 are common to cx_popblock and cx_topblock */
3607 : PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
3608 : PL_scopestack_ix = cx->blk_oldscopesp;
3609 : PL_curpm = cx->blk_oldpm;
3610 :
3611 : /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
3612 : * and leaves a CX entry lying around for repeated use, so
3613 : * skip for multicall */ \
3614 : assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
3615 : || PL_savestack_ix == cx->blk_oldsaveix);
3616 : PL_curcop = cx->blk_oldcop;
3617 : PL_tmps_floor = cx->blk_old_tmpsfloor;
3618 : }
3619 :
3620 : /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
3621 : * Whereas cx_popblock() restores the state to the point just before
3622 : * cx_pushblock() was called, cx_topblock() restores it to the point just
3623 : * *after* cx_pushblock() was called. */
3624 :
3625 : PERL_STATIC_INLINE void
3626 : Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
3627 : {
3628 : PERL_ARGS_ASSERT_CX_TOPBLOCK;
3629 :
3630 : CX_DEBUG(cx, "TOP");
3631 : /* these 3 are common to cx_popblock and cx_topblock */
3632 : PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
3633 : PL_scopestack_ix = cx->blk_oldscopesp;
3634 : PL_curpm = cx->blk_oldpm;
3635 : Perl_rpp_popfree_to(aTHX_ PL_stack_base + cx->blk_oldsp);
3636 : }
3637 :
3638 :
3639 : PERL_STATIC_INLINE void
3640 : Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
3641 : {
3642 : U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
3643 :
3644 : PERL_ARGS_ASSERT_CX_PUSHSUB;
3645 :
3646 : PERL_DTRACE_PROBE_ENTRY(cv);
3647 : cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
3648 : PL_curstackinfo->si_cxsubix = (I32)(cx - PL_curstackinfo->si_cxstack);
3649 : cx->blk_sub.cv = cv;
3650 : cx->blk_sub.olddepth = CvDEPTH(cv);
3651 : cx->blk_sub.prevcomppad = PL_comppad;
3652 : cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
3653 : cx->blk_sub.retop = retop;
3654 : SvREFCNT_inc_simple_void_NN(cv);
3655 : cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
3656 : }
3657 :
3658 :
3659 : /* subsets of cx_popsub() */
3660 :
3661 : PERL_STATIC_INLINE void
3662 : Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
3663 : {
3664 : CV *cv;
3665 :
3666 : PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
3667 : assert(CxTYPE(cx) == CXt_SUB);
3668 :
3669 : PL_comppad = cx->blk_sub.prevcomppad;
3670 : PL_curpad = LIKELY(PL_comppad != NULL) ? AvARRAY(PL_comppad) : NULL;
3671 : cv = cx->blk_sub.cv;
3672 : CvDEPTH(cv) = cx->blk_sub.olddepth;
3673 : cx->blk_sub.cv = NULL;
3674 : SvREFCNT_dec(cv);
3675 : PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
3676 : }
3677 :
3678 :
3679 : /* handle the @_ part of leaving a sub */
3680 :
3681 : PERL_STATIC_INLINE void
3682 : Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
3683 : {
3684 : AV *av;
3685 :
3686 : PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
3687 : assert(CxTYPE(cx) == CXt_SUB);
3688 : assert(AvARRAY(MUTABLE_AV(
3689 : PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
3690 : CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
3691 :
3692 : CX_POP_SAVEARRAY(cx);
3693 : av = MUTABLE_AV(PAD_SVl(0));
3694 : if (!SvMAGICAL(av) && SvREFCNT(av) == 1
3695 : #ifndef PERL_RC_STACK
3696 : && !AvREAL(av)
3697 : #endif
3698 : )
3699 : clear_defarray_simple(av);
3700 : else
3701 : /* abandon @_ if it got reified */
3702 : clear_defarray(av, 0);
3703 : }
3704 :
3705 :
3706 : PERL_STATIC_INLINE void
3707 : Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
3708 : {
3709 : PERL_ARGS_ASSERT_CX_POPSUB;
3710 : assert(CxTYPE(cx) == CXt_SUB);
3711 :
3712 : PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
3713 :
3714 : if (CxHASARGS(cx))
3715 : cx_popsub_args(cx);
3716 : cx_popsub_common(cx);
3717 : }
3718 :
3719 :
3720 : PERL_STATIC_INLINE void
3721 : Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
3722 : {
3723 : PERL_ARGS_ASSERT_CX_PUSHFORMAT;
3724 :
3725 : cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
3726 : PL_curstackinfo->si_cxsubix= (I32)(cx - PL_curstackinfo->si_cxstack);
3727 : cx->blk_format.cv = cv;
3728 : cx->blk_format.retop = retop;
3729 : cx->blk_format.gv = gv;
3730 : cx->blk_format.dfoutgv = PL_defoutgv;
3731 : cx->blk_format.prevcomppad = PL_comppad;
3732 : cx->blk_u16 = 0;
3733 :
3734 : SvREFCNT_inc_simple_void_NN(cv);
3735 : CvDEPTH(cv)++;
3736 : SvREFCNT_inc_void(cx->blk_format.dfoutgv);
3737 : }
3738 :
3739 :
3740 : PERL_STATIC_INLINE void
3741 : Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
3742 : {
3743 : CV *cv;
3744 : GV *dfout;
3745 :
3746 : PERL_ARGS_ASSERT_CX_POPFORMAT;
3747 : assert(CxTYPE(cx) == CXt_FORMAT);
3748 :
3749 : dfout = cx->blk_format.dfoutgv;
3750 : setdefout(dfout);
3751 : cx->blk_format.dfoutgv = NULL;
3752 : SvREFCNT_dec_NN(dfout);
3753 :
3754 : PL_comppad = cx->blk_format.prevcomppad;
3755 : PL_curpad = LIKELY(PL_comppad != NULL) ? AvARRAY(PL_comppad) : NULL;
3756 : cv = cx->blk_format.cv;
3757 : cx->blk_format.cv = NULL;
3758 : --CvDEPTH(cv);
3759 : SvREFCNT_dec_NN(cv);
3760 : PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
3761 : }
3762 :
3763 :
3764 : PERL_STATIC_INLINE void
3765 : Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
3766 : {
3767 : cx->blk_eval.retop = retop;
3768 : cx->blk_eval.old_namesv = namesv;
3769 : cx->blk_eval.old_eval_root = PL_eval_root;
3770 : cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
3771 : cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
3772 : cx->blk_eval.cur_top_env = PL_top_env;
3773 :
3774 : assert(!(PL_in_eval & ~ 0x3F));
3775 : assert(!(PL_op->op_type & ~0x1FF));
3776 : cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
3777 : }
3778 :
3779 : PERL_STATIC_INLINE void
3780 : Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
3781 : {
3782 : PERL_ARGS_ASSERT_CX_PUSHEVAL;
3783 :
3784 : Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
3785 :
3786 : cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
3787 : PL_curstackinfo->si_cxsubix = (I32)(cx - PL_curstackinfo->si_cxstack);
3788 : }
3789 :
3790 : PERL_STATIC_INLINE void
3791 : Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
3792 : {
3793 : PERL_ARGS_ASSERT_CX_PUSHTRY;
3794 :
3795 : Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
3796 :
3797 : /* Don't actually change it, just store the current value so it's restored
3798 : * by the common popeval */
3799 : cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
3800 : }
3801 :
3802 :
3803 : PERL_STATIC_INLINE void
3804 : Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
3805 : {
3806 : SV *sv;
3807 :
3808 : PERL_ARGS_ASSERT_CX_POPEVAL;
3809 : assert(CxTYPE(cx) == CXt_EVAL);
3810 :
3811 : PL_in_eval = CxOLD_IN_EVAL(cx);
3812 : assert(!(PL_in_eval & 0xc0));
3813 : PL_eval_root = cx->blk_eval.old_eval_root;
3814 : sv = cx->blk_eval.cur_text;
3815 : if (sv && CxEVAL_TXT_REFCNTED(cx)) {
3816 : cx->blk_eval.cur_text = NULL;
3817 : SvREFCNT_dec_NN(sv);
3818 : }
3819 :
3820 : sv = cx->blk_eval.old_namesv;
3821 : if (sv) {
3822 : cx->blk_eval.old_namesv = NULL;
3823 : SvREFCNT_dec_NN(sv);
3824 : }
3825 : PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
3826 : }
3827 :
3828 :
3829 : /* push a plain loop, i.e.
3830 : * { block }
3831 : * while (cond) { block }
3832 : * for (init;cond;continue) { block }
3833 : * This loop can be last/redo'ed etc.
3834 : */
3835 :
3836 : PERL_STATIC_INLINE void
3837 : Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
3838 : {
3839 : PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
3840 : cx->blk_loop.my_op = cLOOP;
3841 : }
3842 :
3843 :
3844 : /* push a true for loop, i.e.
3845 : * for var (list) { block }
3846 : */
3847 :
3848 : PERL_STATIC_INLINE void
3849 : Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
3850 : {
3851 : PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
3852 :
3853 : /* this one line is common with cx_pushloop_plain */
3854 : cx->blk_loop.my_op = cLOOP;
3855 :
3856 : cx->blk_loop.itervar_u.svp = (SV**)itervarp;
3857 : cx->blk_loop.itersave = itersave;
3858 : #ifdef USE_ITHREADS
3859 : cx->blk_loop.oldcomppad = PL_comppad;
3860 : #endif
3861 : }
3862 :
3863 :
3864 : /* pop all loop types, including plain */
3865 :
3866 : PERL_STATIC_INLINE void
3867 : Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
3868 : {
3869 : PERL_ARGS_ASSERT_CX_POPLOOP;
3870 :
3871 : assert(CxTYPE_is_LOOP(cx));
3872 : if ( CxTYPE(cx) == CXt_LOOP_ARY
3873 : || CxTYPE(cx) == CXt_LOOP_LAZYSV)
3874 : {
3875 : /* Free ary or cur. This assumes that state_u.ary.ary
3876 : * aligns with state_u.lazysv.cur. See cx_dup() */
3877 : SV *sv = cx->blk_loop.state_u.lazysv.cur;
3878 : cx->blk_loop.state_u.lazysv.cur = NULL;
3879 : SvREFCNT_dec_NN(sv);
3880 : if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
3881 : sv = cx->blk_loop.state_u.lazysv.end;
3882 : cx->blk_loop.state_u.lazysv.end = NULL;
3883 : SvREFCNT_dec_NN(sv);
3884 : }
3885 : }
3886 : if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3887 : SV *cursv;
3888 : SV **svp = (cx)->blk_loop.itervar_u.svp;
3889 : if ((cx->cx_type & CXp_FOR_GV))
3890 : svp = &GvSV((GV*)svp);
3891 : cursv = *svp;
3892 : *svp = cx->blk_loop.itersave;
3893 : cx->blk_loop.itersave = NULL;
3894 : SvREFCNT_dec(cursv);
3895 : }
3896 : if (cx->cx_type & (CXp_FOR_GV|CXp_FOR_LVREF))
3897 : SvREFCNT_dec(cx->blk_loop.itervar_u.svp);
3898 : }
3899 :
3900 :
3901 : PERL_STATIC_INLINE void
3902 : Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
3903 : {
3904 : PERL_ARGS_ASSERT_CX_PUSHWHEN;
3905 :
3906 : cx->blk_givwhen.leave_op = cLOGOP->op_other;
3907 : }
3908 :
3909 :
3910 : PERL_STATIC_INLINE void
3911 : Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
3912 : {
3913 : PERL_ARGS_ASSERT_CX_POPWHEN;
3914 : assert(CxTYPE(cx) == CXt_WHEN);
3915 :
3916 : PERL_UNUSED_ARG(cx);
3917 : PERL_UNUSED_CONTEXT;
3918 : /* currently NOOP */
3919 : }
3920 :
3921 :
3922 : PERL_STATIC_INLINE void
3923 : Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
3924 : {
3925 : PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3926 :
3927 : cx->blk_givwhen.leave_op = cLOGOP->op_other;
3928 : cx->blk_givwhen.defsv_save = orig_defsv;
3929 : }
3930 :
3931 :
3932 : PERL_STATIC_INLINE void
3933 : Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
3934 : {
3935 : SV *sv;
3936 :
3937 : PERL_ARGS_ASSERT_CX_POPGIVEN;
3938 : assert(CxTYPE(cx) == CXt_GIVEN);
3939 :
3940 : sv = GvSV(PL_defgv);
3941 : GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3942 : cx->blk_givwhen.defsv_save = NULL;
3943 : SvREFCNT_dec(sv);
3944 : }
3945 :
3946 :
3947 : /* Make @_ empty in-place in simple cases: a cheap av_clear().
3948 : * See Perl_clear_defarray() for non-simple cases */
3949 :
3950 :
3951 : PERL_STATIC_INLINE void
3952 : Perl_clear_defarray_simple(pTHX_ AV *av)
3953 : {
3954 : PERL_ARGS_ASSERT_CLEAR_DEFARRAY_SIMPLE;
3955 :
3956 : assert(SvTYPE(av) == SVt_PVAV);
3957 : assert(!SvREADONLY(av));
3958 : assert(!SvMAGICAL(av));
3959 : assert(SvREFCNT(av) == 1);
3960 :
3961 : #ifdef PERL_RC_STACK
3962 : assert(AvREAL(av));
3963 : /* this code assumes that destructors called here can't free av
3964 : * itself, because pad[0] and/or CX pointers will keep it alive */
3965 : SSize_t i = AvFILLp(av);
3966 : while (i >= 0) {
3967 : SV *sv = AvARRAY(av)[i];
3968 : AvARRAY(av)[i--] = NULL;
3969 : SvREFCNT_dec(sv);
3970 : }
3971 : #else
3972 : assert(!AvREAL(av));
3973 : #endif
3974 : AvFILLp(av) = -1;
3975 : Perl_av_remove_offset(aTHX_ av);
3976 : }
3977 :
3978 : /* Switch to a different argument stack.
3979 : *
3980 : * Note that it doesn't update PL_curstackinfo->si_stack_nonrc_base,
3981 : * so this should only be used as part of a general switching between
3982 : * stackinfos.
3983 : */
3984 :
3985 : PERL_STATIC_INLINE void
3986 : Perl_switch_argstack(pTHX_ AV *to)
3987 : {
3988 : PERL_ARGS_ASSERT_SWITCH_ARGSTACK;
3989 :
3990 : AvFILLp(PL_curstack) = PL_stack_sp - PL_stack_base;
3991 : PL_stack_base = AvARRAY(to);
3992 : PL_stack_max = PL_stack_base + AvMAX(to);
3993 : PL_stack_sp = PL_stack_base + AvFILLp(to);
3994 : PL_curstack = to;
3995 : }
3996 :
3997 :
3998 : /* Push, and switch to a new stackinfo, allocating one if none are spare,
3999 : * to get a fresh set of stacks.
4000 : * Update all the interpreter variables like PL_curstackinfo,
4001 : * PL_stack_sp, etc.
4002 : * current flag meanings:
4003 : * 1 make the new arg stack AvREAL
4004 : */
4005 :
4006 :
4007 : PERL_STATIC_INLINE void
4008 : Perl_push_stackinfo(pTHX_ I32 type, UV flags)
4009 : {
4010 : PERL_ARGS_ASSERT_PUSH_STACKINFO;
4011 :
4012 : PERL_SI *next = PL_curstackinfo->si_next;
4013 : DEBUG_l({
4014 : int i = 0; PERL_SI *p = PL_curstackinfo;
4015 : while (p) { i++; p = p->si_prev; }
4016 : Perl_deb(aTHX_ "push STACKINFO %d in %s at %s:%d\n",
4017 : i, SAFE_FUNCTION__, __FILE__, __LINE__);
4018 : })
4019 :
4020 : if (!next) {
4021 : next = new_stackinfo_flags(32, 2048/sizeof(PERL_CONTEXT) - 1, flags);
4022 : next->si_prev = PL_curstackinfo;
4023 : PL_curstackinfo->si_next = next;
4024 : }
4025 : next->si_type = type;
4026 : next->si_cxix = -1;
4027 : next->si_cxsubix = -1;
4028 : PUSHSTACK_INIT_HWM(next);
4029 : #ifdef PERL_RC_STACK
4030 : next->si_stack_nonrc_base = 0;
4031 : #endif
4032 : if (flags & 1)
4033 : AvREAL_on(next->si_stack);
4034 : else
4035 : AvREAL_off(next->si_stack);
4036 : AvFILLp(next->si_stack) = 0;
4037 : switch_argstack(next->si_stack);
4038 : PL_curstackinfo = next;
4039 : SET_MARK_OFFSET;
4040 : }
4041 :
4042 :
4043 : /* Pop, then switch to the previous stackinfo and set of stacks.
4044 : * Update all the interpreter variables like PL_curstackinfo,
4045 : * PL_stack_sp, etc. */
4046 :
4047 : PERL_STATIC_INLINE void
4048 : Perl_pop_stackinfo(pTHX)
4049 : {
4050 : PERL_ARGS_ASSERT_POP_STACKINFO;
4051 :
4052 : PERL_SI * const prev = PL_curstackinfo->si_prev;
4053 : DEBUG_l({
4054 : int i = -1; PERL_SI *p = PL_curstackinfo;
4055 : while (p) { i++; p = p->si_prev; }
4056 : Perl_deb(aTHX_ "pop STACKINFO %d in %s at %s:%d\n",
4057 : i, SAFE_FUNCTION__, __FILE__, __LINE__);})
4058 : if (!prev) {
4059 : Perl_croak_popstack();
4060 : }
4061 :
4062 : switch_argstack(prev->si_stack);
4063 : /* don't free prev here, free them all at the END{} */
4064 : PL_curstackinfo = prev;
4065 : }
4066 :
4067 :
4068 :
4069 : /*
4070 : =for apidoc newPADxVOP
4071 :
4072 : Constructs, checks and returns an op containing a pad offset. C<type> is
4073 : the opcode, which should be one of C<OP_PADSV>, C<OP_PADAV>, C<OP_PADHV>
4074 : or C<OP_PADCV>. The returned op will have the C<op_targ> field set by
4075 : the C<padix> argument.
4076 :
4077 : This is convenient when constructing a large optree in nested function
4078 : calls, as it avoids needing to store the pad op directly to set the
4079 : C<op_targ> field as a side-effect. For example
4080 :
4081 : o = op_append_elem(OP_LINESEQ, o,
4082 : newPADxVOP(OP_PADSV, 0, padix));
4083 :
4084 : =cut
4085 : */
4086 :
4087 : PERL_STATIC_INLINE OP *
4088 : Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix)
4089 : {
4090 : PERL_ARGS_ASSERT_NEWPADXVOP;
4091 :
4092 : assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV
4093 : || type == OP_PADCV);
4094 : OP *o = newOP(type, flags);
4095 : o->op_targ = padix;
4096 : return o;
4097 : }
4098 :
4099 : /* ------------------ util.h ------------------------------------------- */
4100 :
4101 : /*
4102 : =for apidoc_section $string
4103 :
4104 : =for apidoc foldEQ
4105 :
4106 : Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
4107 : same
4108 : case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
4109 : match themselves and their opposite case counterparts. Non-cased and non-ASCII
4110 : range bytes match only themselves.
4111 :
4112 : =cut
4113 : */
4114 :
4115 : PERL_STATIC_INLINE I32
4116 : Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len)
4117 : {
4118 : PERL_UNUSED_CONTEXT;
4119 :
4120 : const U8 *a = (const U8 *)s1;
4121 : const U8 *b = (const U8 *)s2;
4122 :
4123 : PERL_ARGS_ASSERT_FOLDEQ;
4124 :
4125 : assert(len >= 0);
4126 :
4127 : while (len--) {
4128 : if (*a != *b && *a != PL_fold[*b])
4129 : return 0;
4130 : a++,b++;
4131 : }
4132 : return 1;
4133 : }
4134 :
4135 : PERL_STATIC_INLINE I32
4136 : Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len)
4137 : {
4138 : /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
4139 : * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
4140 : * does not check for this. Nor does it check that the strings each have
4141 : * at least 'len' characters. */
4142 :
4143 : PERL_UNUSED_CONTEXT;
4144 :
4145 : const U8 *a = (const U8 *)s1;
4146 : const U8 *b = (const U8 *)s2;
4147 :
4148 : PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
4149 :
4150 : assert(len >= 0);
4151 :
4152 : while (len--) {
4153 : if (*a != *b && *a != PL_fold_latin1[*b]) {
4154 : return 0;
4155 : }
4156 : a++, b++;
4157 : }
4158 : return 1;
4159 : }
4160 :
4161 : /*
4162 : =for apidoc_section $locale
4163 : =for apidoc foldEQ_locale
4164 :
4165 : Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
4166 : same case-insensitively in the current locale; false otherwise.
4167 :
4168 : =cut
4169 : */
4170 :
4171 : PERL_STATIC_INLINE I32
4172 : Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len)
4173 : {
4174 : const U8 *a = (const U8 *)s1;
4175 : const U8 *b = (const U8 *)s2;
4176 :
4177 : PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
4178 :
4179 : assert(len >= 0);
4180 :
4181 : while (len--) {
4182 : if (*a != *b && *a != PL_fold_locale[*b]) {
4183 : DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4184 : "%s:%d: Our records indicate %02x is not a fold of %02x"
4185 : " or its mate %02x\n",
4186 : __FILE__, __LINE__, *a, *b, PL_fold_locale[*b]));
4187 :
4188 : return 0;
4189 : }
4190 : a++,b++;
4191 : }
4192 : return 1;
4193 : }
4194 :
4195 : /*
4196 : =for apidoc_section $string
4197 : =for apidoc my_strnlen
4198 :
4199 : The C library C<strnlen> if available, or a Perl implementation of it.
4200 :
4201 : C<my_strnlen()> computes the length of the string, up to C<maxlen>
4202 : bytes. It will never attempt to address more than C<maxlen>
4203 : bytes, making it suitable for use with strings that are not
4204 : guaranteed to be NUL-terminated.
4205 :
4206 : =cut
4207 :
4208 : Description stolen from http://man.openbsd.org/strnlen.3,
4209 : implementation stolen from PostgreSQL.
4210 : */
4211 : #ifndef HAS_STRNLEN
4212 :
4213 : PERL_STATIC_INLINE Size_t
4214 : Perl_my_strnlen(const char *str, Size_t maxlen)
4215 : {
4216 : const char *end = (char *) memchr(str, '\0', maxlen);
4217 :
4218 : PERL_ARGS_ASSERT_MY_STRNLEN;
4219 :
4220 : if (end == NULL) return maxlen;
4221 : return end - str;
4222 : }
4223 :
4224 : #endif
4225 :
4226 : #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
4227 :
4228 : PERL_STATIC_INLINE void *
4229 : S_my_memrchr(const char * s, const char c, const STRLEN len)
4230 : {
4231 : /* memrchr(), since many platforms lack it */
4232 :
4233 : const char * t = s + len - 1;
4234 :
4235 : PERL_ARGS_ASSERT_MY_MEMRCHR;
4236 :
4237 : while (t >= s) {
4238 : if (*t == c) {
4239 : return (void *) t;
4240 : }
4241 : t--;
4242 : }
4243 :
4244 : return NULL;
4245 : }
4246 :
4247 : #endif
4248 :
4249 : PERL_STATIC_INLINE char *
4250 : Perl_mortal_getenv(const char * str)
4251 : {
4252 : /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
4253 : *
4254 : * It's (mostly) thread-safe because it uses a mutex to prevent other
4255 : * threads (that look at this mutex) from destroying the result before this
4256 : * routine has a chance to copy the result to a place that won't be
4257 : * destroyed before the caller gets a chance to handle it. That place is a
4258 : * mortal SV. khw chose this over SAVEFREEPV because he is under the
4259 : * impression that the SV will hang around longer under more circumstances
4260 : *
4261 : * The reason it isn't completely thread-safe is that other code could
4262 : * simply not pay attention to the mutex. All of the Perl core uses the
4263 : * mutex, but it is possible for code from, say XS, to not use this mutex,
4264 : * defeating the safety.
4265 : *
4266 : * getenv() returns, in some implementations, a pointer to a spot in the
4267 : * **environ array, which could be invalidated at any time by this or
4268 : * another thread changing the environment. Other implementations copy the
4269 : * **environ value to a static buffer, returning a pointer to that. That
4270 : * buffer might or might not be invalidated by a getenv() call in another
4271 : * thread. If it does get zapped, we need an exclusive lock. Otherwise,
4272 : * many getenv() calls can safely be running simultaneously, so a
4273 : * many-reader (but no simultaneous writers) lock is ok. There is a
4274 : * Configure probe to see if another thread destroys the buffer, and the
4275 : * mutex is defined accordingly.
4276 : *
4277 : * But in all cases, using the mutex prevents these problems, as long as
4278 : * all code uses the same mutex.
4279 : *
4280 : * A complication is that this can be called during phases where the
4281 : * mortalization process isn't available. These are in interpreter
4282 : * destruction or early in construction. khw believes that at these times
4283 : * there shouldn't be anything else going on, so plain getenv is safe AS
4284 : * LONG AS the caller acts on the return before calling it again. */
4285 :
4286 : char * ret;
4287 : dTHX;
4288 :
4289 : PERL_ARGS_ASSERT_MORTAL_GETENV;
4290 :
4291 : /* Can't mortalize without stacks. khw believes that no other threads
4292 : * should be running, so no need to lock things, and this may be during a
4293 : * phase when locking isn't even available */
4294 : if (UNLIKELY(PL_scopestack_ix == 0)) {
4295 : return getenv(str);
4296 : }
4297 :
4298 : #ifdef PERL_MEM_LOG
4299 :
4300 : /* A major complication arises under PERL_MEM_LOG. When that is active,
4301 : * every memory allocation may result in logging, depending on the value of
4302 : * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for
4303 : * saving ENV{foo}'s value (but before saving it), the logging code will
4304 : * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some
4305 : * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
4306 : * lock a boolean mutex recursively); 3) destroying the getenv() static
4307 : * buffer; or 4) destroying the temporary created by this for the copy
4308 : * causes a log entry to be made which could cause a new temporary to be
4309 : * created, which will need to be destroyed at some point, leading to an
4310 : * infinite loop.
4311 : *
4312 : * The solution adopted here (after some gnashing of teeth) is to detect
4313 : * the recursive calls and calls from the logger, and treat them specially.
4314 : * Let's say we want to do getenv("foo"). We first find
4315 : * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
4316 : * variable, so no temporary is required. Then we do getenv(foo), and in
4317 : * the process of creating a temporary to save it, this function will be
4318 : * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call,
4319 : * we detect that it is such a call and return our saved value instead of
4320 : * locking and doing a new getenv(). This solves all of problems 1), 2),
4321 : * and 3). Because all the getenv()s are done while the mutex is locked,
4322 : * the state cannot have changed. To solve 4), we don't create a temporary
4323 : * when this is called from the logging code. That code disposes of the
4324 : * return value while the mutex is still locked.
4325 : *
4326 : * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
4327 : * digits and 3 particular letters are significant; the rest are ignored by
4328 : * the memory logging code. Thus the per-interpreter variable only needs
4329 : * to be large enough to save the significant information, the size of
4330 : * which is known at compile time. The first byte is extra, reserved for
4331 : * flags for our use. To protect against overflowing, only the reserved
4332 : * byte, as many digits as don't overflow, and the three letters are
4333 : * stored.
4334 : *
4335 : * The reserved byte has two bits:
4336 : * 0x1 if set indicates that if we get here, it is a recursive call of
4337 : * getenv()
4338 : * 0x2 if set indicates that the call is from the logging code.
4339 : *
4340 : * If the flag indicates this is a recursive call, just return the stored
4341 : * value of PL_mem_log; An empty value gets turned into NULL. */
4342 : if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
4343 : if (PL_mem_log[1] == '\0') {
4344 : return NULL;
4345 : } else {
4346 : return PL_mem_log + 1;
4347 : }
4348 : }
4349 :
4350 : #endif
4351 :
4352 : GETENV_LOCK;
4353 :
4354 : #ifdef PERL_MEM_LOG
4355 :
4356 : /* Here we are in a critical section. As explained above, we do our own
4357 : * getenv(PERL_MEM_LOG), saving the result safely. */
4358 : ret = getenv("PERL_MEM_LOG");
4359 : if (ret == NULL) { /* No logging active */
4360 :
4361 : /* Return that immediately if called from the logging code */
4362 : if (PL_mem_log[0] & 0x2) {
4363 : GETENV_UNLOCK;
4364 : return NULL;
4365 : }
4366 :
4367 : PL_mem_log[1] = '\0';
4368 : }
4369 : else {
4370 : char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */
4371 :
4372 : /* There is nothing to prevent the value of PERL_MEM_LOG from being an
4373 : * extremely long string. But we want only a few characters from it.
4374 : * PL_mem_log has been made large enough to hold just the ones we need.
4375 : * First the file descriptor. */
4376 : if (isDIGIT(*ret)) {
4377 : const char * s = ret;
4378 : if (UNLIKELY(*s == '0')) {
4379 :
4380 : /* Reduce multiple leading zeros to a single one. This is to
4381 : * allow the caller to change what to do with leading zeros. */
4382 : *mem_log_meat++ = '0';
4383 : s++;
4384 : while (*s == '0') {
4385 : s++;
4386 : }
4387 : }
4388 :
4389 : /* If the input overflows, copy just enough for the result to also
4390 : * overflow, plus 1 to make sure */
4391 : while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
4392 : *mem_log_meat++ = *s++;
4393 : }
4394 : }
4395 :
4396 : /* Then each of the four significant characters */
4397 : if (strchr(ret, 'm')) {
4398 : *mem_log_meat++ = 'm';
4399 : }
4400 : if (strchr(ret, 's')) {
4401 : *mem_log_meat++ = 's';
4402 : }
4403 : if (strchr(ret, 't')) {
4404 : *mem_log_meat++ = 't';
4405 : }
4406 : if (strchr(ret, 'c')) {
4407 : *mem_log_meat++ = 'c';
4408 : }
4409 : *mem_log_meat = '\0';
4410 :
4411 : assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
4412 : }
4413 :
4414 : /* If we are being called from the logger, it only needs the significant
4415 : * portion of PERL_MEM_LOG, and doesn't need a safe copy */
4416 : if (PL_mem_log[0] & 0x2) {
4417 : assert(strEQ(str, "PERL_MEM_LOG"));
4418 : GETENV_UNLOCK;
4419 : return PL_mem_log + 1;
4420 : }
4421 :
4422 : /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that
4423 : * is coming from other than the logging code, so it should be treated the
4424 : * same as any other getenv(), returning the full value, not just the
4425 : * significant part, and having its value saved. Set the flag that
4426 : * indicates any call to this routine will be a recursion from here */
4427 : PL_mem_log[0] = 0x1;
4428 :
4429 : #endif
4430 :
4431 : /* Now get the value of the real desired variable, and save a copy */
4432 : ret = getenv(str);
4433 :
4434 : if (ret != NULL) {
4435 : ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
4436 : }
4437 :
4438 : GETENV_UNLOCK;
4439 :
4440 : #ifdef PERL_MEM_LOG
4441 :
4442 : /* Clear the buffer */
4443 : Zero(PL_mem_log, sizeof(PL_mem_log), char);
4444 :
4445 : #endif
4446 :
4447 : return ret;
4448 : }
4449 :
4450 : PERL_STATIC_INLINE bool
4451 : Perl_sv_isbool(pTHX_ const SV *sv)
4452 : {
4453 : PERL_UNUSED_CONTEXT;
4454 : return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv);
4455 : }
4456 :
4457 : #ifdef USE_ITHREADS
4458 :
4459 : PERL_STATIC_INLINE AV *
4460 : Perl_cop_file_avn(pTHX_ const COP *cop) {
4461 :
4462 : PERL_ARGS_ASSERT_COP_FILE_AVN;
4463 :
4464 : const char *file = CopFILE(cop);
4465 : if (file) {
4466 : GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
4467 : if (gv) {
4468 : return GvAVn(gv);
4469 : }
4470 : else
4471 : return NULL;
4472 : }
4473 : else
4474 : return NULL;
4475 : }
4476 :
4477 : #endif
4478 :
4479 : PERL_STATIC_INLINE PADNAME *
4480 : Perl_padname_refcnt_inc(PADNAME *pn)
4481 : {
4482 : PadnameREFCNT(pn)++;
4483 : return pn;
4484 : }
4485 :
4486 : PERL_STATIC_INLINE PADNAMELIST *
4487 : Perl_padnamelist_refcnt_inc(PADNAMELIST *pnl)
4488 : {
4489 : PadnamelistREFCNT(pnl)++;
4490 : return pnl;
4491 : }
4492 :
4493 : /* copy a string to a safe spot */
4494 :
4495 : /*
4496 : =for apidoc_section $string
4497 : =for apidoc savepv
4498 :
4499 : Perl's version of C<strdup()>. Returns a pointer to a newly allocated
4500 : string which is a duplicate of C<pv>. The size of the string is
4501 : determined by C<strlen()>, which means it may not contain embedded C<NUL>
4502 : characters and must have a trailing C<NUL>. To prevent memory leaks, the
4503 : memory allocated for the new string needs to be freed when no longer needed.
4504 : This can be done with the C<L</Safefree>> function, or
4505 : L<C<SAVEFREEPV>|perlguts/SAVEFREEPV(p)>.
4506 :
4507 : On some platforms, Windows for example, all allocated memory owned by a thread
4508 : is deallocated when that thread ends. So if you need that not to happen, you
4509 : need to use the shared memory functions, such as C<L</savesharedpv>>.
4510 :
4511 : =cut
4512 : */
4513 :
4514 : PERL_STATIC_INLINE char *
4515 : Perl_savepv(pTHX_ const char *pv)
4516 : {
4517 : PERL_UNUSED_CONTEXT;
4518 : if (!pv)
4519 : return NULL;
4520 : else {
4521 : char *newaddr;
4522 : const STRLEN pvlen = strlen(pv)+1;
4523 : Newx(newaddr, pvlen, char);
4524 : return (char*)memcpy(newaddr, pv, pvlen);
4525 : }
4526 : }
4527 :
4528 : /* same thing but with a known length */
4529 :
4530 : /*
4531 : =for apidoc savepvn
4532 :
4533 : Perl's version of what C<strndup()> would be if it existed. Returns a
4534 : pointer to a newly allocated string which is a duplicate of the first
4535 : C<len> bytes from C<pv>, plus a trailing
4536 : C<NUL> byte. The memory allocated for
4537 : the new string can be freed with the C<Safefree()> function.
4538 :
4539 : On some platforms, Windows for example, all allocated memory owned by a thread
4540 : is deallocated when that thread ends. So if you need that not to happen, you
4541 : need to use the shared memory functions, such as C<L</savesharedpvn>>.
4542 :
4543 : =cut
4544 : */
4545 :
4546 : PERL_STATIC_INLINE char *
4547 : Perl_savepvn(pTHX_ const char *pv, Size_t len)
4548 : {
4549 : char *newaddr;
4550 : PERL_UNUSED_CONTEXT;
4551 :
4552 : Newx(newaddr,len+1,char);
4553 : /* Give a meaning to NULL pointer mainly for the use in sv_magic() */
4554 : if (pv) {
4555 : /* might not be null terminated */
4556 : newaddr[len] = '\0';
4557 : return (char *) CopyD(pv,newaddr,len,char);
4558 : }
4559 : else {
4560 : return (char *) ZeroD(newaddr,len+1,char);
4561 : }
4562 : }
4563 :
4564 : /*
4565 : =for apidoc savesvpv
4566 :
4567 : A version of C<savepv()>/C<savepvn()> which gets the string to duplicate from
4568 : the passed in SV using C<SvPV()>
4569 :
4570 : On some platforms, Windows for example, all allocated memory owned by a thread
4571 : is deallocated when that thread ends. So if you need that not to happen, you
4572 : need to use the shared memory functions, such as C<L</savesharedsvpv>>.
4573 :
4574 : =cut
4575 : */
4576 :
4577 : PERL_STATIC_INLINE char *
4578 : Perl_savesvpv(pTHX_ SV *sv)
4579 : {
4580 : STRLEN len;
4581 : const char * const pv = SvPV_const(sv, len);
4582 : char *newaddr;
4583 :
4584 : PERL_ARGS_ASSERT_SAVESVPV;
4585 :
4586 : ++len;
4587 : Newx(newaddr,len,char);
4588 : return (char *) CopyD(pv,newaddr,len,char);
4589 : }
4590 :
4591 : /*
4592 : =for apidoc savesharedsvpv
4593 :
4594 : A version of C<savesharedpv()> which allocates the duplicate string in
4595 : memory which is shared between threads.
4596 :
4597 : =cut
4598 : */
4599 :
4600 : PERL_STATIC_INLINE char *
4601 : Perl_savesharedsvpv(pTHX_ SV *sv)
4602 : {
4603 : STRLEN len;
4604 : const char * const pv = SvPV_const(sv, len);
4605 :
4606 : PERL_ARGS_ASSERT_SAVESHAREDSVPV;
4607 :
4608 : return savesharedpvn(pv, len);
4609 : }
4610 :
4611 : #ifndef PERL_GET_CONTEXT_DEFINED
4612 :
4613 : /*
4614 : =for apidoc_section $embedding
4615 : =for apidoc get_context
4616 :
4617 : Implements L<perlapi/C<PERL_GET_CONTEXT>>, which you should use instead.
4618 :
4619 : =cut
4620 : */
4621 :
4622 : PERL_STATIC_INLINE void *
4623 : Perl_get_context(void)
4624 : {
4625 : # if defined(USE_ITHREADS)
4626 : # ifdef OLD_PTHREADS_API
4627 : pthread_addr_t t;
4628 : int error = pthread_getspecific(PL_thr_key, &t);
4629 : if (error)
4630 : Perl_croak_nocontext("panic: pthread_getspecific, error=%d", error);
4631 : return (void*)t;
4632 : # elif defined(I_MACH_CTHREADS)
4633 : return (void*)cthread_data(cthread_self());
4634 : # else
4635 : return (void*)PTHREAD_GETSPECIFIC(PL_thr_key);
4636 : # endif
4637 : # else
4638 : return (void*)NULL;
4639 : # endif
4640 : }
4641 :
4642 : #endif
4643 :
4644 : PERL_STATIC_INLINE MGVTBL*
4645 : Perl_get_vtbl(pTHX_ int vtbl_id)
4646 : {
4647 : PERL_UNUSED_CONTEXT;
4648 :
4649 : return (vtbl_id < 0 || vtbl_id >= magic_vtable_max)
4650 : ? NULL : (MGVTBL*)PL_magic_vtables + vtbl_id;
4651 : }
4652 :
4653 : /*
4654 : =for apidoc my_strlcat
4655 :
4656 : The C library C<strlcat> if available, or a Perl implementation of it.
4657 : This operates on C C<NUL>-terminated strings.
4658 :
4659 : C<my_strlcat()> appends string C<src> to the end of C<dst>. It will append at
4660 : most S<C<size - strlen(dst) - 1>> bytes. It will then C<NUL>-terminate,
4661 : unless C<size> is 0 or the original C<dst> string was longer than C<size> (in
4662 : practice this should not happen as it means that either C<size> is incorrect or
4663 : that C<dst> is not a proper C<NUL>-terminated string).
4664 :
4665 : Note that C<size> is the full size of the destination buffer and
4666 : the result is guaranteed to be C<NUL>-terminated if there is room. Note that
4667 : room for the C<NUL> should be included in C<size>.
4668 :
4669 : The return value is the total length that C<dst> would have if C<size> is
4670 : sufficiently large. Thus it is the initial length of C<dst> plus the length of
4671 : C<src>. If C<size> is smaller than the return, the excess was not appended.
4672 :
4673 : =cut
4674 :
4675 : Description stolen from http://man.openbsd.org/strlcat.3
4676 : */
4677 : #ifndef HAS_STRLCAT
4678 : PERL_STATIC_INLINE Size_t
4679 : Perl_my_strlcat(char *dst, const char *src, Size_t size)
4680 : {
4681 : Size_t used, length, copy;
4682 :
4683 : used = strlen(dst);
4684 : length = strlen(src);
4685 : if (size > 0 && used < size - 1) {
4686 : copy = (length >= size - used) ? size - used - 1 : length;
4687 : memcpy(dst + used, src, copy);
4688 : dst[used + copy] = '\0';
4689 : }
4690 : return used + length;
4691 : }
4692 : #endif
4693 :
4694 :
4695 : /*
4696 : =for apidoc my_strlcpy
4697 :
4698 : The C library C<strlcpy> if available, or a Perl implementation of it.
4699 : This operates on C C<NUL>-terminated strings.
4700 :
4701 : C<my_strlcpy()> copies up to S<C<size - 1>> bytes from the string C<src>
4702 : to C<dst>, C<NUL>-terminating the result if C<size> is not 0.
4703 :
4704 : The return value is the total length C<src> would be if the copy completely
4705 : succeeded. If it is larger than C<size>, the excess was not copied.
4706 :
4707 : =cut
4708 :
4709 : Description stolen from http://man.openbsd.org/strlcpy.3
4710 : */
4711 : #ifndef HAS_STRLCPY
4712 : PERL_STATIC_INLINE Size_t
4713 : Perl_my_strlcpy(char *dst, const char *src, Size_t size)
4714 : {
4715 : Size_t length, copy;
4716 :
4717 : length = strlen(src);
4718 : if (size > 0) {
4719 : copy = (length >= size) ? size - 1 : length;
4720 : memcpy(dst, src, copy);
4721 : dst[copy] = '\0';
4722 : }
4723 : return length;
4724 : }
4725 : #endif
4726 :
4727 : /*
4728 : * ex: set ts=8 sts=4 sw=4 et:
4729 : */
|