diff --git a/string.c b/string.c
index c63f59a..95b13c8 100644
--- a/string.c
+++ b/string.c
@@ -6097,6 +6097,143 @@ rb_str_split(VALUE str, const char *sep0)
     return rb_str_split_m(1, &sep, str);
 }
 
+static VALUE rb_str_valid_encoding_p(VALUE str);
+
+/*
+ * Yield the bytes [sub, subend) of +str+ to the block as a new string
+ * created by rb_str_new5(), after OBJ_INFECT() and
+ * rb_enc_cr_str_copy_for_substr() from +str+.  str_mod_check() is then
+ * run against the pointer and length captured before the yield.
+ */
+static void
+line_yield(VALUE str, const char *sub, const char *subend)
+{
+    long len = RSTRING_LEN(str);
+    const char *ptr = RSTRING_PTR(str);
+    VALUE line = rb_str_new5(str, sub, subend - sub);
+
+    OBJ_INFECT(line, str);
+    rb_enc_cr_str_copy_for_substr(line, str);
+    rb_yield(line);
+    str_mod_check(str, ptr, len);
+}
+
+/*
+ * Split +str+ on the bytes of +rs+ with rb_memsearch() and hand each
+ * piece, separator included, to line_yield().  An empty +rs+ selects
+ * paragraph mode: the separator becomes "\n\n" (transcoded when +enc+ is
+ * not ASCII compatible) and any +newline+ characters directly after a
+ * match are absorbed into the same piece.
+ */
+static void
+str_each_line_valid(VALUE str, VALUE rs, unsigned int newline, rb_encoding *enc)
+{
+    int n, rspara = 0;
+    long index, rslen;
+    const char *ptr, *pend, *sub, *subend, *search;
+    const char *hit, *head, *rsptr;
+
+    ptr = sub = search = RSTRING_PTR(str);
+    pend = RSTRING_END(str);
+
+    rslen = RSTRING_LEN(rs);
+
+    if (rslen == 0) {
+	rspara = 1;
+	rs = rb_usascii_str_new("\n\n", 2);
+	if (!rb_enc_asciicompat(enc))
+	    rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
+	rslen = RSTRING_LEN(rs);
+    }
+
+    rsptr = RSTRING_PTR(rs);
+
+    while (search < pend) {
+	index = rb_memsearch(rsptr, rslen, search, pend - search, enc);
+	if (index < 0) break;
+	hit = search + index;
+	head = rb_enc_left_char_head(ptr, hit, pend, enc);
+	if (hit != head) {
+	    /* The match starts inside a character: resume after that
+	     * character and keep +sub+, so the pending line is not lost. */
+	    search = head + rb_enc_mbclen(head, pend, enc);
+	    continue;
+	}
+	subend = hit + rslen;
+	if (rspara) {
+	    /* Test subend < pend before decoding, so a separator at the
+	     * very end of +str+ never decodes an empty range. */
+	    while (subend < pend &&
+		   rb_enc_codepoint_len(subend, pend, &n, enc) == newline)
+		subend += n;
+	}
+	line_yield(str, sub, subend);
+	sub = search = subend;
+    }
+
+    if (sub < pend) line_yield(str, sub, pend);
+
+    /* rs may be a temporary created here or by the caller; rsptr points into it. */
+    RB_GC_GUARD(rs);
+}
+
+/*
+ * Scan +str+ one codepoint at a time with rb_enc_codepoint_len() and
+ * yield a piece through line_yield() whenever +newline+ is found and,
+ * for a separator longer than one byte, the bytes of +rs+ follow.  An
+ * empty +rs+ selects paragraph mode.
+ * This mirrors the loop it replaces in rb_str_each_line().
+ *
+ * NOTE(review): rb_enc_codepoint_len() presumably raises on malformed
+ * bytes; confirm that broken strings are acceptable here, since the
+ * removed memchr() loop for the default separator never decoded them.
+ */
+static void
+str_each_line_invalid(VALUE str, VALUE rs, unsigned int newline, rb_encoding *enc)
+{
+    int n;
+    long rslen;
+    const char *sub, *p, *pend, *rsptr;
+
+    sub = p = RSTRING_PTR(str);
+    pend = RSTRING_END(str);
+
+    rsptr = RSTRING_PTR(rs);
+    rslen = RSTRING_LEN(rs);
+
+    while (p < pend) {
+	/* decode at the scan position, not at the start of the line */
+	unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
+
+      again:
+	if (rslen == 0 && c == newline) {
+	    p += n;
+	    if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
+		goto again;
+	    }
+	    while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
+		p += n;
+	    }
+	    p -= n;
+	}
+	if (c == newline &&
+	    (rslen <= 1 ||
+	     (pend - p >= rslen && memcmp(rsptr, p, rslen) == 0))) {
+	    /* The line ends after the separator; +p+ itself still only
+	     * advances by +n+ below, as in the loop this replaces. */
+	    const char *subend = p + (rslen ? rslen : n);
+
+	    line_yield(str, sub, subend);
+	    sub = subend;
+	}
+	p += n;
+    }
+
+    if (sub < pend) line_yield(str, sub, pend);
+
+    /* rsptr points into rs, which may be a temporary made by the caller. */
+    RB_GC_GUARD(rs);
+}
 
 /*
  *  call-seq:
@@ -6141,95 +6278,40 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str)
     rb_encoding *enc;
     VALUE rs;
     unsigned int newline;
-    const char *p, *pend, *s, *ptr;
-    long len, rslen;
-    VALUE line;
-    int n;
     VALUE orig = str;
 
-    if (argc == 0) {
+    if (argc == 0)
 	rs = rb_rs;
-    }
-    else {
+    else
 	rb_scan_args(argc, argv, "01", &rs);
-    }
+
     RETURN_ENUMERATOR(str, argc, argv);
+
     if (NIL_P(rs)) {
 	rb_yield(str);
 	return orig;
     }
-    str = rb_str_new4(str);
-    ptr = p = s = RSTRING_PTR(str);
-    pend = p + RSTRING_LEN(str);
-    len = RSTRING_LEN(str);
     StringValue(rs);
-    if (rs == rb_default_rs) {
-	enc = rb_enc_get(str);
-	while (p < pend) {
-	    char *p0;
 
-	    p = memchr(p, '\n', pend - p);
-	    if (!p) break;
-	    p0 = rb_enc_left_char_head(s, p, pend, enc);
-	    if (!rb_enc_is_newline(p0, pend, enc)) {
-		p++;
-		continue;
-	    }
-	    p = p0 + rb_enc_mbclen(p0, pend, enc);
-	    line = rb_str_new5(str, s, p - s);
-	    OBJ_INFECT(line, str);
-	    rb_enc_cr_str_copy_for_substr(line, str);
-	    rb_yield(line);
-	    str_mod_check(str, ptr, len);
-	    s = p;
-	}
-	goto finish;
-    }
+    str = rb_str_new4(str);
 
-    enc = rb_enc_check(str, rs);
-    rslen = RSTRING_LEN(rs);
-    if (rslen == 0) {
-	newline = '\n';
+    if (rs == rb_rs) {
+	enc = rb_enc_get(str);
+	rs = rb_str_encode(rb_rs, rb_enc_from_encoding(enc), 0, Qnil);
     }
     else {
-	newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
+	enc = rb_enc_check(str, rs);
     }
 
-    while (p < pend) {
-	unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
-
-      again:
-	if (rslen == 0 && c == newline) {
-	    p += n;
-	    if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
-		goto again;
-	    }
-	    while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
-		p += n;
-	    }
-	    p -= n;
-	}
-	if (c == newline &&
-	    (rslen <= 1 ||
-	     (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
-	    line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
-	    OBJ_INFECT(line, str);
-	    rb_enc_cr_str_copy_for_substr(line, str);
-	    rb_yield(line);
-	    str_mod_check(str, ptr, len);
-	    s = p + (rslen ? rslen : n);
-	}
-	p += n;
-    }
+    if (RSTRING_LEN(rs) == 0)
+	newline = '\n';
+    else
+	newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
 
-  finish:
-    if (s != pend) {
-	line = rb_str_new5(str, s, pend - s);
-	OBJ_INFECT(line, str);
-	rb_enc_cr_str_copy_for_substr(line, str);
-	rb_yield(line);
-	RB_GC_GUARD(str);
-    }
+    if (RTEST(rb_str_valid_encoding_p(str)) && RTEST(rb_str_valid_encoding_p(rs)))
+	str_each_line_valid(str, rs, newline, enc);
+    else
+	str_each_line_invalid(str, rs, newline, enc);
 
     return orig;
 }