Bug #4044: Regex matching errors when using \W character class and /i option - Ruby (original) (raw)
It is still a hack.
Current behavior has a reason:
\W -> (ignore case) -> \W (which contains \u017F, LATIN SMALL LETTER LONG S, whose case fold is "s") + s + S + ... -> not
An experimental patch follows, but it is also wrong.
diff --git a/ChangeLog b/ChangeLog
index 18567e3..9dbe329 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+Wed Nov 17 17:19:02 2010 NARUSE, Yui <naruse@ruby-lang.org>
+
+ * regparse.c: don't apply ignore case to posix bracket, character
+ type, and character property. [ruby-core:33139]
+
Wed Nov 17 15:16:48 2010 NARUSE, Yui <naruse@ruby-lang.org>
* regint.h (OnigOpInfoType): constify name.
diff --git a/regparse.c b/regparse.c
index bf40603..118081f 100644
--- a/regparse.c
+++ b/regparse.c
@@ -4270,6 +4270,8 @@ code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
return 0;
}
+static int cclass_case_fold(Node** np, CClassNode *cc, ScanEnv* env);
+
static int
parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
ScanEnv* env)
@@ -4279,13 +4281,14 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
UChar *p;
Node* node;
CClassNode *cc, *prev_cc;
- CClassNode work_cc;
+ CClassNode work_cc, cased_cc;
enum CCSTATE state;
enum CCVALTYPE val_type, in_type;
int val_israw, in_israw;
prev_cc = (CClassNode* )NULL;
+ initialize_cclass(&cased_cc);
*np = NULL_NODE;
r = fetch_token_in_cc(tok, src, end, env);
if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
@@ -4406,7 +4409,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
break;
case TK_POSIX_BRACKET_OPEN:
- r = parse_posix_bracket(cc, &p, end, env);
+ r = parse_posix_bracket(&cased_cc, &p, end, env);
if (r < 0) goto err;
if (r == 1) { /* is not POSIX bracket */
CC_ESC_WARN(env, (UChar* )"[");
@@ -4419,7 +4422,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
break;
case TK_CHAR_TYPE:
- r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
+ r = add_ctype_to_cc(&cased_cc, tok->u.prop.ctype, tok->u.prop.not, env);
if (r != 0) return r;
next_class:
@@ -4433,7 +4436,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
ctype = fetch_char_property_to_ctype(&p, end, env);
if (ctype < 0) return ctype;
- r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
+ r = add_ctype_to_cc(&cased_cc, ctype, tok->u.prop.not, env);
if (r != 0) return r;
goto next_class;
}
@@ -4501,7 +4504,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
r = parse_char_class(&anode, tok, &p, end, env);
if (r == 0) {
acc = NCCLASS(anode);
- r = or_cclass(cc, acc, env);
+ r = or_cclass(&cased_cc, acc, env);
}
onig_node_free(anode);
if (r != 0) goto err;
@@ -4519,6 +4522,13 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
and_start = 1;
state = CCS_START;
+ if (IS_IGNORECASE(env->option)) {
+ cclass_case_fold(np, cc, env);
+ }
+ if (IS_NOT_NULL(&cased_cc)) {
+ r = or_cclass(cc, &cased_cc, env);
+ initialize_cclass(&cased_cc);
+ }
if (IS_NOT_NULL(prev_cc)) {
r = and_cclass(prev_cc, cc, env);
if (r != 0) goto err;
@@ -4556,6 +4566,13 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
if (r != 0) goto err;
}
+ if (IS_IGNORECASE(env->option)) {
+ cclass_case_fold(np, cc, env);
+ }
+ if (IS_NOT_NULL(&cased_cc)) {
+ r = or_cclass(cc, &cased_cc, env);
+ initialize_cclass(&cased_cc);
+ }
if (IS_NOT_NULL(prev_cc)) {
r = and_cclass(prev_cc, cc, env);
if (r != 0) goto err;
@@ -5136,6 +5153,32 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
}
static int
+cclass_case_fold(Node** np, CClassNode *cc, ScanEnv* env)
+{
+ int r;
+ IApplyCaseFoldArg iarg;
+ iarg.env = env;
+ iarg.cc = cc;
+ iarg.alt_root = NULL_NODE;
+ iarg.ptail = &(iarg.alt_root);
+
+ r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
+ i_apply_case_fold, &iarg);
+ if (r != 0) {
+ onig_node_free(iarg.alt_root);
+ return r;
+ }
+ if (IS_NOT_NULL(iarg.alt_root)) {
+ Node* work = onig_node_new_alt(*np, iarg.alt_root);
+ if (IS_NULL(work)) {
+ onig_node_free(iarg.alt_root);
+ return ONIGERR_MEMORY;
+ }
+ *np = work;
+ }
+ return r;
+}
+static int
parse_exp(Node** np, OnigToken* tok, int term,
UChar** src, UChar* end, ScanEnv* env)
{
@@ -5382,35 +5425,8 @@ parse_exp(Node** np, OnigToken* tok, int term,
case TK_CC_OPEN:
{
- CClassNode* cc;
-
r = parse_char_class(np, tok, src, end, env);
if (r != 0) return r;
-
- cc = NCCLASS(*np);
- if (IS_IGNORECASE(env->option)) {
- IApplyCaseFoldArg iarg;
-
- iarg.env = env;
- iarg.cc = cc;
- iarg.alt_root = NULL_NODE;
- iarg.ptail = &(iarg.alt_root);
-
- r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
- i_apply_case_fold, &iarg);
- if (r != 0) {
- onig_node_free(iarg.alt_root);
- return r;
- }
- if (IS_NOT_NULL(iarg.alt_root)) {
- Node* work = onig_node_new_alt(*np, iarg.alt_root);
- if (IS_NULL(work)) {
- onig_node_free(iarg.alt_root);
- return ONIGERR_MEMORY;
- }
- *np = work;
- }
- }
}
break;
diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb
index 346979d..aaceacf 100644
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@@ -190,6 +190,16 @@ class TestRegexp < Test::Unit::TestCase
assert_equal(false, /(?i:a)/.casefold?)
end
+ def test_caseless_match
+ assert_match(/a/iu, "A")
+ assert_match(/[a-z]/iu, "A")
+ assert_not_match(/[:lower:]/iu, "A")
+ assert_not_match(/\p{Ll}/iu, "A")
+ assert_not_match(/\p{Lower}/iu, "A")
+ assert_match(/[^\p{Lower}]/iu, "A")
+ assert_match(/[^\W]/iu, "A")
+ end
+
def test_options
assert_equal(Regexp::IGNORECASE, /a/i.options)
assert_equal(Regexp::EXTENDED, /a/x.options)