|
|
Fix a double free / segfault with utf8 regexps
|
|
|
Debian #454792
|
|
|
[rt.cpan.org #48156]
|
|
|
[rt.cpan.org #40641]
|
|
|
upstream change 29204
|
|
|
|
|
|
UTF8_ALLOW_DEFAULT definition in utf8.h picked from upstream change 27688
|
|
|
|
|
|
diff --git a/embed.fnc b/embed.fnc
|
|
|
index edfbc0e..26524c7 100644
|
|
|
--- a/embed.fnc
|
|
|
+++ b/embed.fnc
|
|
|
@@ -1168,6 +1168,7 @@ Es |void |reguni |NN const struct RExC_state_t *state|UV uv|NN char *s|NN STRLE
|
|
|
Es |regnode*|regclass |NN struct RExC_state_t *state
|
|
|
ERs |I32 |regcurly |NN const char *
|
|
|
Es |regnode*|reg_node |NN struct RExC_state_t *state|U8 op
|
|
|
+Es |UV |reg_recode |const char value|NULLOK SV **encp
|
|
|
Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp
|
|
|
Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd
|
|
|
Es |void |regoptail |NN struct RExC_state_t *state|NN regnode *p|NN regnode *val
|
|
|
diff --git a/embed.h b/embed.h
|
|
|
index 2b38fd5..372b04f 100644
|
|
|
--- a/embed.h
|
|
|
+++ b/embed.h
|
|
|
@@ -1234,6 +1234,7 @@
|
|
|
#define regclass S_regclass
|
|
|
#define regcurly S_regcurly
|
|
|
#define reg_node S_reg_node
|
|
|
+#define reg_recode S_reg_recode
|
|
|
#define regpiece S_regpiece
|
|
|
#define reginsert S_reginsert
|
|
|
#define regoptail S_regoptail
|
|
|
@@ -3277,6 +3278,7 @@
|
|
|
#define regclass(a) S_regclass(aTHX_ a)
|
|
|
#define regcurly(a) S_regcurly(aTHX_ a)
|
|
|
#define reg_node(a,b) S_reg_node(aTHX_ a,b)
|
|
|
+#define reg_recode(a,b) S_reg_recode(aTHX_ a,b)
|
|
|
#define regpiece(a,b) S_regpiece(aTHX_ a,b)
|
|
|
#define reginsert(a,b,c) S_reginsert(aTHX_ a,b,c)
|
|
|
#define regoptail(a,b,c) S_regoptail(aTHX_ a,b,c)
|
|
|
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
|
|
|
index 9b3134c..7d95216 100644
|
|
|
--- a/pod/perldiag.pod
|
|
|
+++ b/pod/perldiag.pod
|
|
|
@@ -1900,6 +1900,15 @@ recognized by Perl or by a user-supplied handler. See L<attributes>.
|
|
|
(W printf) Perl does not understand the given format conversion. See
|
|
|
L<perlfunc/sprintf>.
|
|
|
|
|
|
+=item Invalid escape in the specified encoding in regex; marked by <-- HERE in m/%s/
|
|
|
+
|
|
|
+(W regexp) The numeric escape (for example C<\xHH>) of value < 256
|
|
|
+didn't correspond to a single character through the conversion
|
|
|
+from the encoding specified by the encoding pragma.
|
|
|
+The escape was replaced with REPLACEMENT CHARACTER (U+FFFD) instead.
|
|
|
+The <-- HERE shows in the regular expression about where the
|
|
|
+escape was discovered.
|
|
|
+
|
|
|
=item Invalid [] range "%s" in regex; marked by <-- HERE in m/%s/
|
|
|
|
|
|
(F) The range specified in a character class had a minimum character
|
|
|
diff --git a/proto.h b/proto.h
|
|
|
index 6d185dd..ef6c0cf 100644
|
|
|
--- a/proto.h
|
|
|
+++ b/proto.h
|
|
|
@@ -1748,6 +1748,7 @@ STATIC I32 S_regcurly(pTHX_ const char *)
|
|
|
__attribute__warn_unused_result__;
|
|
|
|
|
|
STATIC regnode* S_reg_node(pTHX_ struct RExC_state_t *state, U8 op);
|
|
|
+STATIC UV S_reg_recode(pTHX_ const char value, SV **encp);
|
|
|
STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t *state, I32 *flagp);
|
|
|
STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd);
|
|
|
STATIC void S_regoptail(pTHX_ struct RExC_state_t *state, regnode *p, regnode *val);
|
|
|
diff --git a/regcomp.c b/regcomp.c
|
|
|
index 928cf39..98d48dd 100644
|
|
|
--- a/regcomp.c
|
|
|
+++ b/regcomp.c
|
|
|
@@ -2791,6 +2791,39 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
+ * reg_recode
|
|
|
+ *
|
|
|
+ * It returns the code point in utf8 for the value in *encp.
|
|
|
+ * value: a code value in the source encoding
|
|
|
+ * encp: a pointer to an Encode object
|
|
|
+ *
|
|
|
+ * If the result from Encode is not a single character,
|
|
|
+ * it returns U+FFFD (Replacement character) and sets *encp to NULL.
|
|
|
+ */
|
|
|
+STATIC UV
|
|
|
+S_reg_recode(pTHX_ const char value, SV **encp)
|
|
|
+{
|
|
|
+ STRLEN numlen = 1;
|
|
|
+ SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
|
|
|
+ const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
|
|
|
+ : SvPVX(sv);
|
|
|
+ const STRLEN newlen = SvCUR(sv);
|
|
|
+ UV uv = UNICODE_REPLACEMENT;
|
|
|
+
|
|
|
+ if (newlen)
|
|
|
+ uv = SvUTF8(sv)
|
|
|
+ ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
|
|
|
+ : *(U8*)s;
|
|
|
+
|
|
|
+ if (!newlen || numlen != newlen) {
|
|
|
+ uv = UNICODE_REPLACEMENT;
|
|
|
+ if (encp)
|
|
|
+ *encp = NULL;
|
|
|
+ }
|
|
|
+ return uv;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
- regatom - the lowest level
|
|
|
*
|
|
|
* Optimization: gobbles an entire sequence of ordinary characters so that
|
|
|
@@ -3182,6 +3215,8 @@ tryagain:
|
|
|
ender = grok_hex(p, &numlen, &flags, NULL);
|
|
|
p += numlen;
|
|
|
}
|
|
|
+ if (PL_encoding && ender < 0x100)
|
|
|
+ goto recode_encoding;
|
|
|
break;
|
|
|
case 'c':
|
|
|
p++;
|
|
|
@@ -3201,6 +3236,17 @@ tryagain:
|
|
|
--p;
|
|
|
goto loopdone;
|
|
|
}
|
|
|
+ if (PL_encoding && ender < 0x100)
|
|
|
+ goto recode_encoding;
|
|
|
+ break;
|
|
|
+ recode_encoding:
|
|
|
+ {
|
|
|
+ SV* enc = PL_encoding;
|
|
|
+ ender = reg_recode((const char)(U8)ender, &enc);
|
|
|
+ if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
|
|
|
+ vWARN(p, "Invalid escape in the specified encoding");
|
|
|
+ RExC_utf8 = 1;
|
|
|
+ }
|
|
|
break;
|
|
|
case '\0':
|
|
|
if (p >= RExC_end)
|
|
|
@@ -3331,32 +3377,6 @@ tryagain:
|
|
|
break;
|
|
|
}
|
|
|
|
|
|
- /* If the encoding pragma is in effect recode the text of
|
|
|
- * any EXACT-kind nodes. */
|
|
|
- if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
|
|
|
- STRLEN oldlen = STR_LEN(ret);
|
|
|
- SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
|
|
|
-
|
|
|
- if (RExC_utf8)
|
|
|
- SvUTF8_on(sv);
|
|
|
- if (sv_utf8_downgrade(sv, TRUE)) {
|
|
|
- const char * const s = sv_recode_to_utf8(sv, PL_encoding);
|
|
|
- const STRLEN newlen = SvCUR(sv);
|
|
|
-
|
|
|
- if (SvUTF8(sv))
|
|
|
- RExC_utf8 = 1;
|
|
|
- if (!SIZE_ONLY) {
|
|
|
- DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
|
|
|
- (int)oldlen, STRING(ret),
|
|
|
- (int)newlen, s));
|
|
|
- Copy(s, STRING(ret), newlen, char);
|
|
|
- STR_LEN(ret) += newlen - oldlen;
|
|
|
- RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
|
|
|
- } else
|
|
|
- RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
return(ret);
|
|
|
}
|
|
|
|
|
|
@@ -3734,6 +3754,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
|
|
|
value = grok_hex(RExC_parse, &numlen, &flags, NULL);
|
|
|
RExC_parse += numlen;
|
|
|
}
|
|
|
+ if (PL_encoding && value < 0x100)
|
|
|
+ goto recode_encoding;
|
|
|
break;
|
|
|
case 'c':
|
|
|
value = UCHARAT(RExC_parse++);
|
|
|
@@ -3741,13 +3763,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
|
|
|
break;
|
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
|
- {
|
|
|
- I32 flags = 0;
|
|
|
- numlen = 3;
|
|
|
- value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
|
|
|
- RExC_parse += numlen;
|
|
|
- break;
|
|
|
- }
|
|
|
+ {
|
|
|
+ I32 flags = 0;
|
|
|
+ numlen = 3;
|
|
|
+ value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
|
|
|
+ RExC_parse += numlen;
|
|
|
+ if (PL_encoding && value < 0x100)
|
|
|
+ goto recode_encoding;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ recode_encoding:
|
|
|
+ {
|
|
|
+ SV* enc = PL_encoding;
|
|
|
+ value = reg_recode((const char)(U8)value, &enc);
|
|
|
+ if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
|
|
|
+ vWARN(RExC_parse,
|
|
|
+ "Invalid escape in the specified encoding");
|
|
|
+ break;
|
|
|
+ }
|
|
|
default:
|
|
|
if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
|
|
|
vWARN2(RExC_parse,
|
|
|
diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t
|
|
|
index 606a84a..354156a 100755
|
|
|
--- a/t/uni/tr_utf8.t
|
|
|
+++ b/t/uni/tr_utf8.t
|
|
|
@@ -31,7 +31,7 @@ BEGIN {
|
|
|
}
|
|
|
|
|
|
use strict;
|
|
|
-use Test::More tests => 7;
|
|
|
+use Test::More tests => 8;
|
|
|
|
|
|
use encoding 'utf8';
|
|
|
|
|
|
@@ -67,4 +67,12 @@ is($str, $hiragana, "s/// # hiragana -> katakana");
|
|
|
$line =~ tr/bcdeghijklmnprstvwxyz$02578/בצדעגהיײקלמנפּרסטװשכיזשױתײחא/;
|
|
|
is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]");
|
|
|
}
|
|
|
+
|
|
|
+{
|
|
|
+ # [perl #40641]
|
|
|
+ my $str = qq/Gebääääääääääääääääääääude/;
|
|
|
+ my $reg = qr/Gebääääääääääääääääääääude/;
|
|
|
+ ok($str =~ /$reg/, "[perl #40641]");
|
|
|
+}
|
|
|
+
|
|
|
__END__
|
|
|
diff --git a/utf8.h b/utf8.h
|
|
|
index 6d63897..3800866 100644
|
|
|
--- a/utf8.h
|
|
|
+++ b/utf8.h
|
|
|
@@ -198,6 +198,8 @@ encoded character.
|
|
|
UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
|
|
|
#define UTF8_ALLOW_ANY 0x00FF
|
|
|
#define UTF8_CHECK_ONLY 0x0200
|
|
|
+#define UTF8_ALLOW_DEFAULT (ckWARN(WARN_UTF8) ? 0 : \
|
|
|
+ UTF8_ALLOW_ANYUV)
|
|
|
|
|
|
#define UNICODE_SURROGATE_FIRST 0xD800
|
|
|
#define UNICODE_SURROGATE_LAST 0xDFFF
|