You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
gentoo-overlay/dev-lang/perl/files/perl-5.8.8-CVE-2008-1927.patch

257 lines
7.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

Fix a double free / segfault with utf8 regexps
Debian #454792
[rt.cpan.org #48156]
[rt.cpan.org #40641]
upstream change 29204
UTF8_ALLOW_DEFAULT definition in utf8.h picked from upstream change 27688
diff --git a/embed.fnc b/embed.fnc
index edfbc0e..26524c7 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1168,6 +1168,7 @@ Es |void |reguni |NN const struct RExC_state_t *state|UV uv|NN char *s|NN STRLE
Es |regnode*|regclass |NN struct RExC_state_t *state
ERs |I32 |regcurly |NN const char *
Es |regnode*|reg_node |NN struct RExC_state_t *state|U8 op
+Es |UV |reg_recode |const char value|NULLOK SV **encp
Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp
Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd
Es |void |regoptail |NN struct RExC_state_t *state|NN regnode *p|NN regnode *val
diff --git a/embed.h b/embed.h
index 2b38fd5..372b04f 100644
--- a/embed.h
+++ b/embed.h
@@ -1234,6 +1234,7 @@
#define regclass S_regclass
#define regcurly S_regcurly
#define reg_node S_reg_node
+#define reg_recode S_reg_recode
#define regpiece S_regpiece
#define reginsert S_reginsert
#define regoptail S_regoptail
@@ -3277,6 +3278,7 @@
#define regclass(a) S_regclass(aTHX_ a)
#define regcurly(a) S_regcurly(aTHX_ a)
#define reg_node(a,b) S_reg_node(aTHX_ a,b)
+#define reg_recode(a,b) S_reg_recode(aTHX_ a,b)
#define regpiece(a,b) S_regpiece(aTHX_ a,b)
#define reginsert(a,b,c) S_reginsert(aTHX_ a,b,c)
#define regoptail(a,b,c) S_regoptail(aTHX_ a,b,c)
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 9b3134c..7d95216 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1900,6 +1900,15 @@ recognized by Perl or by a user-supplied handler. See L<attributes>.
(W printf) Perl does not understand the given format conversion. See
L<perlfunc/sprintf>.
+=item Invalid escape in the specified encoding in regex; marked by <-- HERE in m/%s/
+
+(W regexp) The numeric escape (for example C<\xHH>) of value < 256
+didn't correspond to a single character through the conversion
+from the encoding specified by the encoding pragma.
+The escape was replaced with REPLACEMENT CHARACTER (U+FFFD) instead.
+The <-- HERE shows in the regular expression about where the
+escape was discovered.
+
=item Invalid [] range "%s" in regex; marked by <-- HERE in m/%s/
(F) The range specified in a character class had a minimum character
diff --git a/proto.h b/proto.h
index 6d185dd..ef6c0cf 100644
--- a/proto.h
+++ b/proto.h
@@ -1748,6 +1748,7 @@ STATIC I32 S_regcurly(pTHX_ const char *)
__attribute__warn_unused_result__;
STATIC regnode* S_reg_node(pTHX_ struct RExC_state_t *state, U8 op);
+STATIC UV S_reg_recode(pTHX_ const char value, SV **encp);
STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t *state, I32 *flagp);
STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd);
STATIC void S_regoptail(pTHX_ struct RExC_state_t *state, regnode *p, regnode *val);
diff --git a/regcomp.c b/regcomp.c
index 928cf39..98d48dd 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2791,6 +2791,39 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
}
/*
+ * reg_recode
+ *
+ * It returns the code point in utf8 for the value in *encp.
+ * value: a code value in the source encoding
+ * encp: a pointer to an Encode object
+ *
+ * If the result from Encode is not a single character,
+ * it returns U+FFFD (Replacement character) and sets *encp to NULL.
+ */
+STATIC UV
+S_reg_recode(pTHX_ const char value, SV **encp)
+{
+ STRLEN numlen = 1;
+ SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
+ const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
+ : SvPVX(sv);
+ const STRLEN newlen = SvCUR(sv);
+ UV uv = UNICODE_REPLACEMENT;
+
+ if (newlen)
+ uv = SvUTF8(sv)
+ ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
+ : *(U8*)s;
+
+ if (!newlen || numlen != newlen) {
+ uv = UNICODE_REPLACEMENT;
+ if (encp)
+ *encp = NULL;
+ }
+ return uv;
+}
+
+/*
- regatom - the lowest level
*
* Optimization: gobbles an entire sequence of ordinary characters so that
@@ -3182,6 +3215,8 @@ tryagain:
ender = grok_hex(p, &numlen, &flags, NULL);
p += numlen;
}
+ if (PL_encoding && ender < 0x100)
+ goto recode_encoding;
break;
case 'c':
p++;
@@ -3201,6 +3236,17 @@ tryagain:
--p;
goto loopdone;
}
+ if (PL_encoding && ender < 0x100)
+ goto recode_encoding;
+ break;
+ recode_encoding:
+ {
+ SV* enc = PL_encoding;
+ ender = reg_recode((const char)(U8)ender, &enc);
+ if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
+ vWARN(p, "Invalid escape in the specified encoding");
+ RExC_utf8 = 1;
+ }
break;
case '\0':
if (p >= RExC_end)
@@ -3331,32 +3377,6 @@ tryagain:
break;
}
- /* If the encoding pragma is in effect recode the text of
- * any EXACT-kind nodes. */
- if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
- STRLEN oldlen = STR_LEN(ret);
- SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
-
- if (RExC_utf8)
- SvUTF8_on(sv);
- if (sv_utf8_downgrade(sv, TRUE)) {
- const char * const s = sv_recode_to_utf8(sv, PL_encoding);
- const STRLEN newlen = SvCUR(sv);
-
- if (SvUTF8(sv))
- RExC_utf8 = 1;
- if (!SIZE_ONLY) {
- DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
- (int)oldlen, STRING(ret),
- (int)newlen, s));
- Copy(s, STRING(ret), newlen, char);
- STR_LEN(ret) += newlen - oldlen;
- RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
- } else
- RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
- }
- }
-
return(ret);
}
@@ -3734,6 +3754,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
value = grok_hex(RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
}
+ if (PL_encoding && value < 0x100)
+ goto recode_encoding;
break;
case 'c':
value = UCHARAT(RExC_parse++);
@@ -3741,13 +3763,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- {
- I32 flags = 0;
- numlen = 3;
- value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
- RExC_parse += numlen;
- break;
- }
+ {
+ I32 flags = 0;
+ numlen = 3;
+ value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
+ RExC_parse += numlen;
+ if (PL_encoding && value < 0x100)
+ goto recode_encoding;
+ break;
+ }
+ recode_encoding:
+ {
+ SV* enc = PL_encoding;
+ value = reg_recode((const char)(U8)value, &enc);
+ if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
+ vWARN(RExC_parse,
+ "Invalid escape in the specified encoding");
+ break;
+ }
default:
if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
vWARN2(RExC_parse,
diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t
index 606a84a..354156a 100755
--- a/t/uni/tr_utf8.t
+++ b/t/uni/tr_utf8.t
@@ -31,7 +31,7 @@ BEGIN {
}
use strict;
-use Test::More tests => 7;
+use Test::More tests => 8;
use encoding 'utf8';
@@ -67,4 +67,12 @@ is($str, $hiragana, "s/// # hiragana -> katakana");
$line =~ tr/bcdeghijklmnprstvwxyz$02578/בצדעגהיײקלמנפּרסטװשכיזשױתײחא/;
is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]");
}
+
+{
+ # [perl #40641]
+ my $str = qq/Gebääääääääääääääääääääude/;
+ my $reg = qr/Gebääääääääääääääääääääude/;
+ ok($str =~ /$reg/, "[perl #40641]");
+}
+
__END__
diff --git a/utf8.h b/utf8.h
index 6d63897..3800866 100644
--- a/utf8.h
+++ b/utf8.h
@@ -198,6 +198,8 @@ encoded character.
UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#define UTF8_ALLOW_ANY 0x00FF
#define UTF8_CHECK_ONLY 0x0200
+#define UTF8_ALLOW_DEFAULT (ckWARN(WARN_UTF8) ? 0 : \
+ UTF8_ALLOW_ANYUV)
#define UNICODE_SURROGATE_FIRST 0xD800
#define UNICODE_SURROGATE_LAST 0xDFFF