/*
|| This file is part of Pike. For copyright information see COPYRIGHT.
|| Pike is distributed under GPL, LGPL and MPL. See the file COPYING
|| for more information.
*/
1b10db2002-10-08Martin Nilsson 
387cbc2001-06-21Per Hedbor #include "global.h" #include "stralloc.h" #include "pike_macros.h" #include "interpret.h" #include "program.h" #include "program_id.h" #include "object.h" #include "operators.h" #include "module_support.h" #include "config.h" #include "buffer.h" #include "normalize.h" struct comp { const int c1; const int c2; const int c; }; /* 12 bytes/entry */ struct decomp { const int c; const int compat; const int data[2]; }; /* 12 bytes/entry */ struct comp_h { const struct comp *v; struct comp_h *next; }; struct decomp_h { const struct decomp *v; struct decomp_h *next; }; struct canonic_h { const struct canonical_cl *v; struct canonic_h *next; }; struct canonical_cl { const int c; const int cl; }; /* 8 bytes/entry */ /* generated from .txt */ #include "decompositions.h" #include "canonicals.h" static struct comp_h comp_h[sizeof(_c)/sizeof(_c[0])]; static struct comp_h *comp_hash[HSIZE]; static struct decomp_h decomp_h[sizeof(_d)/sizeof(_d[0])]; static struct decomp_h *decomp_hash[HSIZE]; static struct canonic_h canonic_h[sizeof(_ca)/sizeof(_ca[0])]; static struct canonic_h *canonic_hash[HSIZE];
a11a122002-07-16Martin Stjernholm #ifdef PIKE_DEBUG static int hashes_inited = 0; #endif
387cbc2001-06-21Per Hedbor 
c9eefb2014-08-21Martin Nilsson static void init_hashes(void)
387cbc2001-06-21Per Hedbor { unsigned int i;
a11a122002-07-16Martin Stjernholm  #ifdef PIKE_DEBUG
5aad932002-08-15Marcus Comstedt  if (hashes_inited) Pike_fatal ("init_hashes called twice\n");
a11a122002-07-16Martin Stjernholm  hashes_inited = 1; #endif
387cbc2001-06-21Per Hedbor  for( i = 0; i<sizeof(_d)/sizeof(_d[0]); i++ ) {
b8ff3c2014-01-11Arne Goedeke  unsigned int h = (unsigned int)_d[i].c%HSIZE;
387cbc2001-06-21Per Hedbor  decomp_h[i].v = _d+i; decomp_h[i].next = decomp_hash[h]; decomp_hash[h] = decomp_h+i; } for( i = 0; i<sizeof(_c)/sizeof(_c[0]); i++ ) {
b8ff3c2014-01-11Arne Goedeke  unsigned int h = (((unsigned int)_c[i].c1<<16)|_c[i].c2)%HSIZE;
387cbc2001-06-21Per Hedbor  comp_h[i].v = _c+i; comp_h[i].next = comp_hash[h]; comp_hash[h] = comp_h+i; } for( i = 0; i<sizeof(_ca)/sizeof(_ca[0]); i++ ) {
b8ff3c2014-01-11Arne Goedeke  unsigned int h = (unsigned int)_ca[i].c % HSIZE;
387cbc2001-06-21Per Hedbor  canonic_h[i].v = _ca+i; canonic_h[i].next = canonic_hash[h]; canonic_hash[h] = canonic_h+i; } }
c9eefb2014-08-21Martin Nilsson void unicode_normalize_init(void)
387cbc2001-06-21Per Hedbor { init_hashes(); } const struct decomp *get_decomposition( int c ) {
b8ff3c2014-01-11Arne Goedeke  unsigned int hv = (unsigned int)c % HSIZE;
387cbc2001-06-21Per Hedbor  const struct decomp_h *r = decomp_hash[hv]; while( r ) { if( r->v->c == c ) return r->v; r = r->next; } return 0; } int get_canonical_class( int c ) {
b8ff3c2014-01-11Arne Goedeke  unsigned int hv = (unsigned int)c % HSIZE;
387cbc2001-06-21Per Hedbor  const struct canonic_h *r = canonic_hash[hv]; while( r ) { if( r->v->c == c ) return r->v->cl; r = r->next; } return 0; } #define SBase 0xAC00 #define LBase 0x1100 #define VBase 0x1161 #define TBase 0x11A7 #define LCount 19 #define VCount 21 #define TCount 28 #define NCount (VCount * TCount) #define SCount (LCount * NCount)
2faa512001-07-04Per Hedbor int get_compose_pair( int c1, int c2 ) { const struct comp_h *r;
b8ff3c2014-01-11Arne Goedeke  unsigned int hv;
2faa512001-07-04Per Hedbor  if( c1 >= LBase ) { /* Perhaps hangul */ int LIndex = c1-LBase, SIndex; if( LIndex < LCount ) { int VIndex = c2-VBase; if( 0 <= VIndex && VIndex < VCount ) return SBase + (LIndex*VCount + VIndex)*TCount; } if( c1 >= SBase ) { SIndex = c1-SBase; if( SIndex < SCount && (SIndex % TCount)== 0 ) { int TIndex = c2-TBase; if( 0 <= TIndex && TIndex <= TCount ) /* LVT */ return c1+TIndex; } } }
b8ff3c2014-01-11Arne Goedeke  hv = (unsigned int)c1 << 16 | (unsigned int)c2;
2faa512001-07-04Per Hedbor  /* Nope. Not hangul. */
b8ff3c2014-01-11Arne Goedeke  for( r=comp_hash[ hv % HSIZE ]; r; r=r->next )
2faa512001-07-04Per Hedbor  if( (r->v->c1 == c1) && (r->v->c2 == c2) ) return r->v->c; return 0; }
387cbc2001-06-21Per Hedbor static void rec_get_decomposition( int canonical, int c, struct buffer *tmp ) {
6c34282004-04-11Per Hedbor  const struct decomp *decomp; if( (decomp = get_decomposition( c )) && !(canonical && decomp->compat) )
387cbc2001-06-21Per Hedbor  { if( decomp->data[0] ) rec_get_decomposition( canonical, decomp->data[0], tmp ); if( decomp->data[1] ) rec_get_decomposition( canonical, decomp->data[1], tmp ); } else { if( (c >= SBase) && c < (SBase+SCount) ) /* Hangul */ { int l, v, t; c-=SBase; l= LBase + c / NCount; v = VBase + (c % NCount) / TCount; t = TBase + (c % TCount); uc_buffer_write( tmp, l ); uc_buffer_write( tmp, v ); if( t != TBase ) uc_buffer_write( tmp, t ); } else uc_buffer_write( tmp, c ); } } struct buffer *unicode_decompose_buffer( struct buffer *source, int how ) { unsigned int i, j; struct buffer *res = uc_buffer_new(); struct buffer *tmp = uc_buffer_new(); int canonical = !(how & COMPAT_BIT); for( i = 0; i<source->size; i++ ) {
6c34282004-04-11Per Hedbor  if( source->data[i] < 160 ) { uc_buffer_write( res, source->data[i] ); } else
387cbc2001-06-21Per Hedbor  {
6c34282004-04-11Per Hedbor  tmp->size = 0; rec_get_decomposition( canonical, source->data[i], tmp ); for( j = 0; j<tmp->size; j++ )
387cbc2001-06-21Per Hedbor  {
6c34282004-04-11Per Hedbor  int c = tmp->data[j]; int cl = get_canonical_class( c ); int k = res->size; /* Sort combining marks */ if( cl != 0 ) { for( ; k > 0; k-- ) if( get_canonical_class( res->data[k-1] ) <= cl ) break; } uc_buffer_insert( res, k, c );
387cbc2001-06-21Per Hedbor  } } } uc_buffer_free( tmp ); uc_buffer_free( source ); return res; }
74dfe82012-12-30Jonas Walldén struct buffer *unicode_compose_buffer( struct buffer *source, int UNUSED(how) )
387cbc2001-06-21Per Hedbor { int startch = source->data[0]; int lastclass = get_canonical_class( startch )?256:0; unsigned int startpos = 0, comppos=1; unsigned int pos;
13670c2015-05-25Martin Nilsson 
387cbc2001-06-21Per Hedbor  for( pos = 1; pos < source->size; pos++ ) {
2faa512001-07-04Per Hedbor  int ch = source->data[ pos ];
387cbc2001-06-21Per Hedbor  int cl = get_canonical_class( ch );
2faa512001-07-04Per Hedbor  int co = get_compose_pair( startch, ch );
387cbc2001-06-21Per Hedbor  if( co && ((lastclass < cl) || (lastclass == 0)) ) source->data[ startpos ] = startch = co; else { if( cl == 0 ) { startpos = comppos; startch = ch; } lastclass = cl; source->data[comppos++] = ch; } } source->size = comppos;
2faa512001-07-04Per Hedbor  return source;
387cbc2001-06-21Per Hedbor } struct pike_string *unicode_normalize( struct pike_string *source, int how ) {
6178ca2001-11-22Henrik Grubbström (Grubba)  /* Special case for the empty string. */ if (!source->len) { add_ref(source); return source; }
a5c4432001-07-04Per Hedbor  /* What, me lisp? */
015f432018-03-01Henrik Grubbström (Grubba)  if( how & COMPOSE_BIT ) { if (!source->size_shift && !(how & COMPAT_BIT)) { /* NB: There are 8-bit characters that are changed in * compat mode; eg NBSP (0xA0) and DIAERESIS (0xA8). */ add_ref(source); return source; }
a5c4432001-07-04Per Hedbor  return uc_buffer_to_pikestring( unicode_compose_buffer( unicode_decompose_buffer( uc_buffer_write_pikestring( uc_buffer_new(), source ), how ), how ) );
015f432018-03-01Henrik Grubbström (Grubba)  }
a5c4432001-07-04Per Hedbor  return uc_buffer_to_pikestring( unicode_decompose_buffer( uc_buffer_write_pikestring( uc_buffer_new(), source ), how ) );
387cbc2001-06-21Per Hedbor }