e576bb2002-10-11Martin Nilsson /* || This file is part of Pike. For copyright information see COPYRIGHT. || Pike is distributed under GPL, LGPL and MPL. See the file COPYING || for more information. */
1b10db2002-10-08Martin Nilsson 
387cbc2001-06-21Per Hedbor #include "global.h" #include "stralloc.h" #include "pike_macros.h" #include "interpret.h" #include "program.h" #include "program_id.h" #include "object.h" #include "operators.h" #include "module_support.h" #include "config.h" #include "buffer.h" #include "split.h"
6c34282004-04-11Per Hedbor static inline struct words *uc_words_mkspace( struct words *d, int n )
387cbc2001-06-21Per Hedbor { while( d->size+n > d->allocated_size ) {
6c34282004-04-11Per Hedbor  d->allocated_size *= 2;
387cbc2001-06-21Per Hedbor  d = realloc( d, d->allocated_size*8+(sizeof(struct words)-8) ); } return d; } struct words *uc_words_write( struct words *d, unsigned int start, unsigned int len ) { d = uc_words_mkspace( d,1 ); d->words[d->size].start = start; d->words[d->size].size = len; d->size++; return d; }
9800f12017-06-21Martin Nilsson struct words *uc_words_new( void )
387cbc2001-06-21Per Hedbor { struct words *s = malloc( sizeof( struct words ) + 31*8 ); s->allocated_size = 32; s->size = 0; return s; } void uc_words_free( struct words *w ) { free( w ); } #include "wordbits.h"
6c34282004-04-11Per Hedbor static inline int _unicode_is_wordchar( int c )
387cbc2001-06-21Per Hedbor { unsigned int i; for( i = 0; i<sizeof(ranges)/sizeof(ranges[0]); i++ ) if( c <= ranges[i].end )
75b8452006-02-28Marcus Comstedt  return (c>=ranges[i].start? ( (c >= 0x3400 && c <= 0x9fff) || (c >= 0x20000 && c <= 0x2ffff) ? /* CJK */ 2 : 1 ) : 0);
387cbc2001-06-21Per Hedbor  return 0; }
6c34282004-04-11Per Hedbor int unicode_is_wordchar( int c ) { return _unicode_is_wordchar(c); } struct words *unicode_split_words_pikestr0( struct pike_string *data )
387cbc2001-06-21Per Hedbor { unsigned int i; unsigned int in_word = 0; unsigned int last_start = 0; struct words *res = uc_words_new();
143cd12008-08-05Martin Stjernholm  p_wchar0 *ptr = STR0 (data);
6c34282004-04-11Per Hedbor  unsigned int sz = data->len;
13670c2015-05-25Martin Nilsson 
6c34282004-04-11Per Hedbor  for( i=0; i<sz; i++, ptr++ ) { switch( _unicode_is_wordchar( *ptr ) ) { case 1: /* normal */ if( *ptr > 127 ) { uc_words_free( res ); return NULL; }
ee90a92007-11-10Martin Nilsson 
6c34282004-04-11Per Hedbor  if( !in_word ) { last_start = i; in_word=1; } break; case 0: /* not */ if( in_word ) { in_word=0; res = uc_words_write( res, last_start, i-last_start ); } break; } } if( in_word ) return uc_words_write( res, last_start, i-last_start ); return res; }
387cbc2001-06-21Per Hedbor 
6c34282004-04-11Per Hedbor struct words *unicode_split_words_buffer( struct buffer *data ) { unsigned int i; unsigned int in_word = 0; unsigned int last_start = 0; struct words *res = uc_words_new();
143cd12008-08-05Martin Stjernholm  p_wchar2 *ptr = data->data;
6c34282004-04-11Per Hedbor  unsigned int sz = data->size; for( i=0; i<sz; i++ )
387cbc2001-06-21Per Hedbor  {
6c34282004-04-11Per Hedbor  switch( _unicode_is_wordchar( *ptr++ ) )
387cbc2001-06-21Per Hedbor  { case 1: /* normal */ if( !in_word ) { last_start = i; in_word=1; } break; case 0: /* not */ if( in_word ) { in_word=0; res = uc_words_write( res, last_start, i-last_start ); } break; case 2: /* single character word */ if( in_word ) { in_word=0; res = uc_words_write( res, last_start, i-last_start ); } res = uc_words_write( res, i, 1 ); } } if( in_word ) return uc_words_write( res, last_start, i-last_start ); return res; }