Feature #15331 » 0002-Hash-code-memoization-for-short-fstrings.patch
benchmark/freeze_unique_strings.yml | ||
---|---|---|
prelude: |
|
||
str = +"0000000"
|
||
benchmark:
|
||
freeze_unique_strings: |
|
||
str.succ!
|
||
-str
|
||
loop_count: 9000000 # freeze this many unique strings
|
benchmark/hash_aref_fstr.rb | ||
---|---|---|
# Benchmark: repeated Hash#[] lookups keyed by the 26 strings 'a'..'z',
# each of which is passed through String#-@ before being used as a key.
h = {}
|
||
# map!(&:-@) replaces every element with the result of calling -@ on it.
strs = ('a'..'z').to_a.map!(&:-@)
|
||
# Populate the hash once; each string maps to itself.
strs.each { |s| h[s] = s }
|
||
# Measured work: 500_000 passes, each looking up every key once.
500_000.times { strs.each { |s| h[s] } }
|
benchmark/hash_aref_long_str.rb | ||
---|---|---|
# Benchmark: repeated Hash#[] lookups using a 100-character string as the key.
h = {}
|
||
# Array#* repeats the one-element array ten times, so all ten entries of
# strs refer to the same String object.
strs = ['a' * 100] * 10
|
||
# Populate the hash; each string maps to itself.
strs.each { |s| h[s] = s }
|
||
# Measured work: 200_000 passes, each performing ten lookups.
200_000.times { strs.each { |s| h[s] } }
|
ext/-test-/string/cstr.c | ||
---|---|---|
rb_define_singleton_method(klass, "cstr_noembed", bug_str_s_cstr_noembed, 1);
|
||
rb_define_singleton_method(klass, "cstr_embedded?", bug_str_s_cstr_embedded_p, 1);
|
||
rb_define_singleton_method(klass, "rb_str_new_frozen", bug_str_s_rb_str_new_frozen, 1);
|
||
rb_define_const(klass, "HASH_MEMO_STR_BUFFER_CAPA", INT2NUM(HASH_MEMO_STR_BUFFER_CAPA));
|
||
}
|
internal.h | ||
---|---|---|
#define STR_SHARED_P(s) FL_ALL_RAW((s), STR_NOEMBED|ELTS_SHARED)
|
||
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
|
||
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
|
||
/* Size in bytes (twice SIZEOF_VALUE) of the character buffer declared in
 * struct memoized_hash_embeded_str in string.c. Also exported to the test
 * extension as the constant HASH_MEMO_STR_BUFFER_CAPA. */
#define HASH_MEMO_STR_BUFFER_CAPA ((int) (SIZEOF_VALUE * 2))
|
||
size_t rb_str_memsize(VALUE);
|
||
VALUE rb_sym_proc_call(ID mid, int argc, const VALUE *argv, VALUE passed_proc);
|
||
VALUE rb_sym_to_proc(VALUE sym);
|
string.c | ||
---|---|---|
RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
|
||
} while (0)
|
||
/* Layout that SET_HASH_MEMO / GET_HASH_MEMO impose on an RString's `as`
 * member: a character buffer followed by a slot for the string's hash.
 * A STATIC_ASSERT in this file requires the struct to be exactly as large
 * as RSTRING(0)->as. */
struct memoized_hash_embeded_str {
|
||
/* String bytes plus terminator; HASH_MEMO_STR_BUFFER_CAPA bytes in total. */
char ary[HASH_MEMO_STR_BUFFER_CAPA];
|
||
/* Hash value written by SET_HASH_MEMO and read back by GET_HASH_MEMO. */
st_index_t memoized_hash;
|
||
};
|
||
/* We need to have enough space for the string plus its encoding-dependent terminator.
   There are encodings whose terminator is two bytes or wider, but we save on fetching
   the encoding struct by being conservative: any encoding index above ENCINDEX_US_ASCII
   is assumed to need a 4-byte terminator. Change this when there is an encoding with
   8-byte units. See encindex.h. */
|
||
/* True when the embedded length of `str` plus the reserved terminator
 * (1 byte when enc_idx <= ENCINDEX_US_ASCII, otherwise 4) fits within
 * HASH_MEMO_STR_BUFFER_CAPA bytes.
 * `enc_idx` is parenthesized so that an expression argument (for example a
 * conditional expression) is compared as a whole rather than being split by
 * the precedence of `<=`. */
#define SHORT_ENOUGH_TO_MEMO_HASH(str, enc_idx) \
    ((RSTRING_EMBED_LEN(str) + (((enc_idx) <= ENCINDEX_US_ASCII) ? 1 : 4)) <= HASH_MEMO_STR_BUFFER_CAPA)
|
||
STATIC_ASSERT(memoized_hash_type_punning, sizeof(struct memoized_hash_embeded_str) == sizeof(RSTRING(0)->as));
|
||
/* Write `hash` into the memoized_hash slot of `str`, viewing the RString's
 * `as` member through struct memoized_hash_embeded_str (the same access
 * path GET_HASH_MEMO reads from). */
#define SET_HASH_MEMO(str, hash) do { \
    ((struct memoized_hash_embeded_str *)&RSTRING(str)->as)->memoized_hash = (hash); \
} while (0)
|
||
/* Read the memoized_hash slot of `str` by viewing the RString's `as` member
 * through struct memoized_hash_embeded_str. The macro performs no checks of
 * its own; rb_str_hash tests the string's flags and length before using it. */
#define GET_HASH_MEMO(str) (((struct memoized_hash_embeded_str*) &RSTRING(str)->as)->memoized_hash)
|
||
#define STR_SET_LEN(str, n) do { \
|
||
if (STR_EMBED_P(str)) {\
|
||
STR_SET_EMBED_LEN((str), (n));\
|
||
... | ... | |
rb_str_hash,
|
||
};
|
||
/* Argument block shared between register_fstring and fstr_update_callback
 * (passed through the callback's `arg` parameter). */
struct fstr_update {
|
||
/* Result slot: the callback stores the interned string here, or Qundef
 * after deleting a garbage entry so that the caller retries. */
VALUE ret;
|
||
/* Hash of the string being registered, computed by register_fstring;
 * the callback memoizes it into sufficiently short embedded strings. */
st_index_t hash;
|
||
};
|
||
/* True when `str` has neither the FL_TAINT nor the FL_EXIVAR flag set and
 * its class is exactly rb_cString. */
#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_TAINT|FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
|
||
static int
|
||
fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing)
|
||
{
|
||
VALUE *fstr = (VALUE *)arg;
|
||
struct fstr_update *update = (struct fstr_update *)arg;
|
||
VALUE str = (VALUE)*key;
|
||
if (existing) {
|
||
... | ... | |
* at next time */
|
||
if (rb_objspace_garbage_object_p(str)) {
|
||
*fstr = Qundef;
|
||
update->ret = Qundef;
|
||
return ST_DELETE;
|
||
}
|
||
*fstr = str;
|
||
update->ret = str;
|
||
return ST_STOP;
|
||
}
|
||
else {
|
||
... | ... | |
str = str_new_frozen(rb_cString, str);
|
||
}
|
||
}
|
||
if (STR_EMBED_P(str) && SHORT_ENOUGH_TO_MEMO_HASH(str, ENCODING_GET_INLINED(str))) {
|
||
SET_HASH_MEMO(str, update->hash);
|
||
}
|
||
RBASIC(str)->flags |= RSTRING_FSTR;
|
||
*key = *value = *fstr = str;
|
||
*key = *value = str;
|
||
update->ret = str;
|
||
return ST_CONTINUE;
|
||
}
|
||
}
|
||
... | ... | |
return fstr;
|
||
}
|
||
/* Forward declaration; per the trailing comment the definition lives in st.c.
 * register_fstring calls this with the string's precomputed hash as `hash`.
 * NOTE(review): presumably a variant of st_update that skips rehashing the
 * key -- confirm against st.c. */
int
|
||
st_update_with_hash(st_table *tab, st_data_t key, st_index_t hash,
|
||
st_update_callback_func *func, st_data_t arg); /* st.c */
|
||
static VALUE
|
||
register_fstring(VALUE str)
|
||
{
|
||
VALUE ret;
|
||
struct fstr_update update;
|
||
st_table *frozen_strings = rb_vm_fstring_table();
|
||
st_index_t hash = rb_str_hash(str);
|
||
update.hash = hash;
|
||
do {
|
||
ret = str;
|
||
st_update(frozen_strings, (st_data_t)str,
|
||
fstr_update_callback, (st_data_t)&ret);
|
||
} while (ret == Qundef);
|
||
assert(OBJ_FROZEN(ret));
|
||
assert(!FL_TEST_RAW(ret, STR_FAKESTR));
|
||
assert(!FL_TEST_RAW(ret, FL_EXIVAR));
|
||
assert(!FL_TEST_RAW(ret, FL_TAINT));
|
||
assert(RBASIC_CLASS(ret) == rb_cString);
|
||
return ret;
|
||
update.ret = str;
|
||
st_update_with_hash(frozen_strings, (st_data_t)str, hash,
|
||
fstr_update_callback, (st_data_t)&update);
|
||
} while (update.ret == Qundef);
|
||
assert(OBJ_FROZEN(update.ret));
|
||
assert(!FL_TEST_RAW(update.ret, STR_FAKESTR));
|
||
assert(!FL_TEST_RAW(update.ret, FL_EXIVAR));
|
||
assert(!FL_TEST_RAW(update.ret, FL_TAINT));
|
||
assert(RBASIC_CLASS(update.ret) == rb_cString);
|
||
return update.ret;
|
||
}
|
||
static VALUE
|
||
... | ... | |
str_make_independent_expand(str, len, 0L, termlen);
|
||
}
|
||
else {
|
||
if (UNLIKELY(termlen > 4)) {
|
||
rb_raise(rb_eArgError, "terminator is longer than 4 bytes");
|
||
}
|
||
TERM_FILL(s + len, termlen);
|
||
return s;
|
||
}
|
||
... | ... | |
long capa = str_capacity(str, oldtermlen) + oldtermlen;
|
||
long len = RSTRING_LEN(str);
|
||
assert(termlen <= 4);
|
||
assert(capa >= len);
|
||
if (capa - len < termlen) {
|
||
rb_check_lockedtmp(str);
|
||
... | ... | |
rb_str_hash(VALUE str)
|
||
{
|
||
int e = ENCODING_GET(str);
|
||
if (FL_TEST_RAW(str, RSTRING_FSTR | RSTRING_NOEMBED) == RSTRING_FSTR && SHORT_ENOUGH_TO_MEMO_HASH(str, e)) {
|
||
return GET_HASH_MEMO(str);
|
||
}
|
||
if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
|
||
e = 0;
|
||
}
|
test/-ext-/string/test_cstr.rb | ||
---|---|---|
WCHARS = [Encoding::UTF_16BE, Encoding::UTF_16LE, Encoding::UTF_32BE, Encoding::UTF_32LE]
|
||
# For every encoding in WCHARS and every length from 1 up to
# Bug::String::HASH_MEMO_STR_BUFFER_CAPA, interns a string of that length and
# checks that its #hash still equals the value computed before interning,
# even after Bug::String.cstr_term has been called on the interned string.
def test_fstring_hash_memo_wchar
  # Plain #each: the element index is not needed here.
  WCHARS.each do |encoding|
    (1..Bug::String::HASH_MEMO_STR_BUFFER_CAPA).each do |i|
      contrived_string = +('8' * i)
      contrived_string.force_encoding(encoding)
      before = contrived_string.hash
      fstring = -(contrived_string)
      assert Bug::String.cstr_embedded?(fstring)
      # StringValueCStr writes a terminator past the end of the string buffer,
      # which could clobber the memoized hash.
      Bug::String.cstr_term(fstring)
      assert_equal(before, fstring.hash)
    end
  end
end
|
||
def test_wchar_embed
|
||
WCHARS.each do |enc|
|
||
s = Bug::String.new("\u{4022}a".encode(enc))
|
- « Previous
- 1
- …
- 6
- 7
- 8
- Next »