Branch: Tag:

2017-02-11

2017-02-11 13:26:27 by Arne Goedeke <el@laramies.com>

Faster string_to_utf8()

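This commit splits string_to_utf8() into two new functions: one which
calculates the length of the resulting string (pike_string_utf8_length())
and one which performs the actual encoding (pike_string_utf8_encode()).
This makes it possible to, e.g., encode UTF-8 directly into a buffer.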

The length calculation has been rewritten with a separate version for each
shift size. For 8bit strings it uses a popcount loop, which counts the
number of high bits (one for each code point bigger than 0x7f) in
machine-word-sized chunks. On machines which have popcount instructions
this is much faster. With compilers which do not support
__builtin_popcount, it falls back to a simple manual popcount.

For 16bit and 32bit strings the length calculation uses clz (count leading
zeros) to determine the number of bits in each code point, so that the
encoded length can be calculated without branches.

The encoding function is split into one version for each shift size.
For 32bit strings it avoids branches by using the resulting byte
lengths as a jump size. This generates reasonable code, at least in gcc.

Benchmark results on my i7:

utf8/code.pike#encode_7bit | 1.3 G 1.6 % | 8.3 G 3.4 % |
utf8/code.pike#encode_8bit | 651.1 M 1.8 % | 1.1 G 1.2 % |
utf8/code.pike#encode_arabic | 498.4 M 0.8 % | 710.3 M 1.2 % |
utf8/code.pike#encode_bulgarian | 488.2 M 1.2 % | 688.4 M 2.6 % |
utf8/code.pike#encode_estonian | 614.8 M 6.6 % | 969.5 M 1.5 % |
utf8/code.pike#encode_hebrew | 496.9 M 1.8 % | 710.1 M 1.0 % |
utf8/code.pike#encode_japanese | 704.9 M 4.0 % | 785.4 M 1.6 % |
utf8/code.pike#encode_polish | 388.9 M 0.4 % | 710.1 M 1.3 % |
utf8/code.pike#encode_thai | 642.8 M 3.3 % | 858.0 M 0.9 % |
utf8/code.pike#encode_yiddish | 485.9 M 3.3 % | 692.5 M 3.8 % |

I also tested on arm32; the speedups there are around 50%.

2123:    ptrdiff_t len;    struct pike_string *in;    struct pike_string *out; -  ptrdiff_t i; +     INT_TYPE extended = 0;    PCHARP src;    INT32 min, max;
2131:       get_all_args("string_to_utf8", args, "%W.%i", &in, &extended);    -  len = in->len; -  +     check_string_range(in, 1, &min, &max);       if (min >= 0 && max <= 0x7f) {
2141:    return;    }    -  for(i=0,src=MKPCHARP_STR(in); i < in->len; INC_PCHARP(src,1),i++) { -  unsigned INT32 c = EXTRACT_PCHARP(src); -  if (c & ~0x7f) { -  /* 8bit or more. */ -  len++; -  if (c & ~0x7ff) { -  /* 12bit or more. */ -  len++; -  if (c & ~0xffff) { -  /* 17bit or more. */ -  len++; -  if (!extended && c > 0x10ffff) -  bad_arg_error ("string_to_utf8", Pike_sp - args, args, 1, -  NULL, Pike_sp - args, -  "Character 0x%08x at index %"PRINTPTRDIFFT"d is " -  "outside the allowed range.\n", -  c, i); -  if (c & ~0x1fffff) { -  /* 22bit or more. */ -  len++; -  if (c & ~0x3ffffff) { -  /* 27bit or more. */ -  len++; -  if (c & ~0x7fffffff) { -  /* 32bit or more. */ -  len++; -  /* FIXME: Needs fixing when we get 64bit chars... */ -  } -  } -  } -  } -  else if (!extended && c >= 0xd800 && c <= 0xdfff) -  bad_arg_error ("string_to_utf8", Pike_sp - args, args, 1, -  NULL, Pike_sp - args, -  "Character 0x%08x at index %"PRINTPTRDIFFT"d is " -  "in the surrogate range and therefore invalid.\n", -  c, i); -  } -  } -  } +  len = pike_string_utf8_length(in, args, extended); +     if (len == in->len) {    /* 7bit string -- already valid utf8. 
*/    pop_n_elems(args - 1);    return;    }    out = begin_shared_string(len); -  dst = STR0(out); +     -  for(i=0,src=MKPCHARP_STR(in); i < in->len; INC_PCHARP(src,1),i++) { -  unsigned INT32 c = EXTRACT_PCHARP(src); -  if (!(c & ~0x7f)) { -  /* 7bit */ -  *dst++ = c; -  } else if (!(c & ~0x7ff)) { -  /* 11bit */ -  *dst++ = 0xc0 | (c >> 6); -  *dst++ = 0x80 | (c & 0x3f); -  } else if (!(c & ~0xffff)) { -  /* 16bit */ -  *dst++ = 0xe0 | (c >> 12); -  *dst++ = 0x80 | ((c >> 6) & 0x3f); -  *dst++ = 0x80 | (c & 0x3f); -  } else if (!(c & ~0x1fffff)) { -  /* 21bit */ -  *dst++ = 0xf0 | (c >> 18); -  *dst++ = 0x80 | ((c >> 12) & 0x3f); -  *dst++ = 0x80 | ((c >> 6) & 0x3f); -  *dst++ = 0x80 | (c & 0x3f); -  } else if (!(c & ~0x3ffffff)) { -  /* 26bit */ -  *dst++ = 0xf8 | (c >> 24); -  *dst++ = 0x80 | ((c >> 18) & 0x3f); -  *dst++ = 0x80 | ((c >> 12) & 0x3f); -  *dst++ = 0x80 | ((c >> 6) & 0x3f); -  *dst++ = 0x80 | (c & 0x3f); -  } else if (!(c & ~0x7fffffff)) { -  /* 31bit */ -  *dst++ = 0xfc | (c >> 30); -  *dst++ = 0x80 | ((c >> 24) & 0x3f); -  *dst++ = 0x80 | ((c >> 18) & 0x3f); -  *dst++ = 0x80 | ((c >> 12) & 0x3f); -  *dst++ = 0x80 | ((c >> 6) & 0x3f); -  *dst++ = 0x80 | (c & 0x3f); -  } else { -  /* 32 - 36bit */ -  *dst++ = (char)0xfe; -  *dst++ = 0x80 | ((c >> 30) & 0x3f); -  *dst++ = 0x80 | ((c >> 24) & 0x3f); -  *dst++ = 0x80 | ((c >> 18) & 0x3f); -  *dst++ = 0x80 | ((c >> 12) & 0x3f); -  *dst++ = 0x80 | ((c >> 6) & 0x3f); -  *dst++ = 0x80 | (c & 0x3f); -  } -  } +  dst = pike_string_utf8_encode(STR0(out), in);   #ifdef PIKE_DEBUG    if (len != dst - STR0(out)) {    Pike_fatal("string_to_utf8(): Calculated and actual lengths differ: "