diff --git a/internal/string.h b/internal/string.h index 3533766ffb7b5c..4ccd13b9734538 100644 --- a/internal/string.h +++ b/internal/string.h @@ -31,6 +31,7 @@ enum ruby_rstring_private_flags { VALUE rb_fstring(VALUE); VALUE rb_fstring_cstr(const char *str); VALUE rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc); +void rb_str_buf_cat_byte(VALUE str, unsigned char byte); int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p); int rb_str_symname_p(VALUE); VALUE rb_str_quote_unprintable(VALUE); diff --git a/string.c b/string.c index c97351c0d3a1c6..926c8fcfb3e620 100644 --- a/string.c +++ b/string.c @@ -3341,7 +3341,7 @@ rb_str_cat_cstr(VALUE str, const char *ptr) return rb_str_buf_cat(str, ptr, strlen(ptr)); } -static void +void rb_str_buf_cat_byte(VALUE str, unsigned char byte) { RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII); diff --git a/yjit/bindgen/src/main.rs b/yjit/bindgen/src/main.rs index 62c7ff2c795965..d844e22c3cf64c 100644 --- a/yjit/bindgen/src/main.rs +++ b/yjit/bindgen/src/main.rs @@ -228,6 +228,7 @@ fn main() { .allowlist_function("rb_str_concat_literals") .allowlist_function("rb_obj_as_string_result") .allowlist_function("rb_str_byte_substr") + .allowlist_function("rb_str_buf_cat_byte") // From include/ruby/internal/intern/parse.h .allowlist_function("rb_backref_get") diff --git a/yjit/src/codegen.rs b/yjit/src/codegen.rs index 961d6438e3d823..6a9bfcd3ef655f 100644 --- a/yjit/src/codegen.rs +++ b/yjit/src/codegen.rs @@ -5893,24 +5893,115 @@ fn jit_rb_str_empty_p( return true; } -// Codegen for rb_str_concat() -- *not* String#concat -// Frequently strings are concatenated using "out_str << next_str". -// This is common in Erb and similar templating languages. -fn jit_rb_str_concat( +// Codegen for rb_str_concat() with an integer argument -- *not* String#concat +// Using strings as a byte buffer often includes appending byte values to the end of the string. +fn jit_rb_str_concat_codepoint( jit: &mut JITState, asm: &mut Assembler, - _ocb: &mut OutlinedCb, + ocb: &mut OutlinedCb, _ci: *const rb_callinfo, _cme: *const rb_callable_method_entry_t, - _block: Option, + _block: Option, _argc: i32, _known_recv_class: Option, +) -> bool { + asm_comment!(asm, "String#<< with codepoint argument"); + + // Ensure the codepoint argument is a Fixnum. + let arg = asm.stack_opnd(0); + let comptime_arg = jit.peek_at_stack(&asm.ctx, 0); + if comptime_arg.fixnum_p() { + jit_guard_known_klass( + jit, + asm, + ocb, + comptime_arg.class_of(), + arg, + arg.into(), + comptime_arg, + SEND_MAX_DEPTH, + Counter::guard_send_not_fixnums, + ); + } else { + return false; + } + + // Either of the string concatenation functions we call will reallocate the string to grow its + // capacity if necessary. In extremely rare cases (i.e., string exceeds `LONG_MAX` bytes), + // either of the called functions will raise an exception. + jit_prepare_non_leaf_call(jit, asm); + + let codepoint = asm.stack_pop(1); + let recv = asm.stack_pop(1); + + // In order to use the fast path (rb_str_buf_cat_byte), the string encoding must be ASCII-8BIT + // and the codepoint must be in the byte range (0x00 - 0xff). + // If either of those conditions are not met we must use the general string concat (str_buf_cat) + // function with the original codepoint argument. + + let generic_str_concat_codepoint = asm.new_label("generic_str_concat_codepoint"); + let ret_label = asm.new_label("jit_rb_str_concat_codepoint_return"); + + // Check if the string is ASCII-8BIT. If it isn't, we need to use the generic string concatenation. + asm_comment!(asm, "Check if string is ASCII-8BIT"); + let recv_reg = asm.load(recv); + let flags_opnd = Opnd::mem(VALUE_BITS, recv_reg, RUBY_OFFSET_RBASIC_FLAGS); + let encoding_flags_opnd = asm.and(flags_opnd, Opnd::UImm(RUBY_ENCODING_MASK as u64)); + let encoding_index = asm.rshift(encoding_flags_opnd, Opnd::UImm(RUBY_ENCODING_SHIFT as u64)); + asm.cmp(encoding_index, Opnd::UImm(RUBY_ENCINDEX_ASCII_8BIT as u64)); + asm.jne(generic_str_concat_codepoint); + + // Check if the codepoint is limited to a byte value. If it isn't, we need to use the generic + // string concatenation, which will ultimately raise a `RangeError` as ASCII-8BIT strings only + // accept codepoint values that fit in a byte range. + asm_comment!(asm, "Check if codepoint is a byte value"); + let codepoint_untag = asm.rshift(codepoint, Opnd::UImm(1)); + asm.cmp(codepoint_untag, Opnd::Imm(0xff)); + asm.jg(generic_str_concat_codepoint); + + // If we've made it this far, we must be appending a single byte value to a binary string. + asm_comment!(asm, "Optimized (string, codepoint) concatenation"); + let ret_opnd = asm.ccall(rb_str_buf_cat_byte as *const u8, vec![recv, codepoint_untag]); + let stack_ret = asm.stack_push(Type::TString); + asm.mov(stack_ret, ret_opnd); + asm.stack_pop(1); + asm.jmp(ret_label); + + // Either the string isn't binary or the codepoint is too large. + asm_comment!(asm, "Fallback to generic (string, codepoint) concatenation"); + asm.write_label(generic_str_concat_codepoint); + let ret_opnd = asm.ccall(rb_str_concat as *const u8, vec![recv, codepoint]); + let stack_ret = asm.stack_push(Type::TString); + asm.mov(stack_ret, ret_opnd); + asm.jmp(ret_label); + + asm.write_label(ret_label); + + true +} + +// Codegen for rb_str_concat() -- *not* String#concat +// Frequently strings are concatenated using "out_str << next_str". +// This is common in Erb and similar templating languages. +fn jit_rb_str_concat( + jit: &mut JITState, + asm: &mut Assembler, + ocb: &mut OutlinedCb, + ci: *const rb_callinfo, + cme: *const rb_callable_method_entry_t, + block: Option, + argc: i32, + known_recv_class: Option, ) -> bool { // The << operator can accept integer codepoints for characters // as the argument. We only specially optimise string arguments. // If the peeked-at compile time argument is something other than // a string, assume it won't be a string later either. let comptime_arg = jit.peek_at_stack(&asm.ctx, 0); + if unsafe { RB_TYPE_P(comptime_arg, RUBY_T_FIXNUM) } { + return jit_rb_str_concat_codepoint(jit, asm, ocb, ci, cme, block, argc, known_recv_class); + } + if ! unsafe { RB_TYPE_P(comptime_arg, RUBY_T_STRING) } { return false; } @@ -10357,7 +10448,7 @@ pub fn yjit_reg_method_codegen_fns() { } // Register a specialized codegen function for a particular method. Note that -// the if the function returns true, the code it generates runs without a +// if the function returns true, the code it generates runs without a // control frame and without interrupt checks. To avoid creating observable // behavior changes, the codegen function should only target simple code paths // that do not allocate and do not make method calls. diff --git a/yjit/src/cruby.rs b/yjit/src/cruby.rs index b0691376640cc7..2d388a1a9f677a 100644 --- a/yjit/src/cruby.rs +++ b/yjit/src/cruby.rs @@ -117,6 +117,7 @@ extern "C" { ci: *const rb_callinfo, ) -> *const rb_callable_method_entry_t; pub fn rb_hash_empty_p(hash: VALUE) -> VALUE; + pub fn rb_str_concat(str1: VALUE, str2: VALUE) -> VALUE; pub fn rb_str_setbyte(str: VALUE, index: VALUE, value: VALUE) -> VALUE; pub fn rb_vm_splat_array(flag: VALUE, ary: VALUE) -> VALUE; pub fn rb_vm_concat_array(ary1: VALUE, ary2st: VALUE) -> VALUE; diff --git a/yjit/src/cruby_bindings.inc.rs b/yjit/src/cruby_bindings.inc.rs index cbe635f060e380..1e91996e9ab8eb 100644 --- a/yjit/src/cruby_bindings.inc.rs +++ b/yjit/src/cruby_bindings.inc.rs @@ -1078,6 +1078,7 @@ extern "C" { pub fn rb_gvar_set(arg1: ID, arg2: VALUE) -> VALUE; pub fn rb_ensure_iv_list_size(obj: VALUE, len: u32, newsize: u32); pub fn rb_vm_barrier(); + pub fn rb_str_buf_cat_byte(str_: VALUE, byte: ::std::os::raw::c_uchar); pub fn rb_str_byte_substr(str_: VALUE, beg: VALUE, len: VALUE) -> VALUE; pub fn rb_obj_as_string_result(str_: VALUE, obj: VALUE) -> VALUE; pub fn rb_str_concat_literals(num: usize, strary: *const VALUE) -> VALUE;