src/libcaptive/rtl/unicode.c

   1 /* $Id$
   2  * Unicode add-ons to reactos ntoskrnl/rtl/unicode.c for libcaptive
   3  * Copyright (C) 2002 Jan Kratochvil <project-captive@jankratochvil.net>
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; exactly version 2 of June 1991 is required
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  17  */
  18
  19
  20 #include "config.h"
  21
  22 #include "captive/unicode.h"    /* self */
  23 #include "captive/unicode_reactos.h"    /* for captive_ucs2 */
  24 #include <glib/gtypes.h>
  25 #include <glib/gmessages.h>
  26 #include <glib/gunicode.h>
  27 #include <glib/gmem.h>
  28 #include "reactos/napi/types.h"  /* for PUNICODE_STRING etc. */
  29 #include "reactos/unicode.h"
  30 #include "captive/macros.h"
  31 #include <glib/gstrfuncs.h>
  32 #include <wchar.h>      /* for wcslen() */
  33 #include <glib/ghash.h>
  34 #include <string.h>
  35
  36
  37 /* CONFIG: */
  38 /* Implement the *_malloc() converters as plain g_malloc() wrappers around the *_alloca_internal_sizeof() and *_alloca_internal_fill() helpers.
  39  */
  40 #define FUNCMALLOC_FROM_ALLOCA 1
  41
  42
  43 /* compiler sanity */
  44 static gboolean captive_validate_unicode_types(void)
  45 {
  46         g_return_val_if_fail(4==sizeof(gunichar),FALSE);
  47         g_return_val_if_fail(2==sizeof(WCHAR),FALSE);
  48         g_return_val_if_fail(1==sizeof(CHAR),FALSE);
  49
  50         return TRUE;
  51 }
  52
  53
  54 /**
  55  * captive_validate_ucs4:
  56  * @string_ucs4: #const #gunichar * type string to validate.
  57  * Invalid string input is forbidden.
  58  *
  59  * Checks the validity of all 32-bit unicharacters of 0-terminated string.
  60  * It is required to have characters complying to g_unichar_validate().
  61  *
  62  * Returns: %TRUE if the string is valid.
  63  */
  64 gboolean captive_validate_ucs4(const gunichar *string_ucs4)
  65 {
  66 const gunichar *cs_ucs4;
  67
  68         g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
  69         g_return_val_if_fail(string_ucs4!=NULL,FALSE);
  70
  71         for (cs_ucs4=string_ucs4;*cs_ucs4;cs_ucs4++)
  72                 g_return_val_if_fail(g_unichar_validate(*cs_ucs4),FALSE);
  73
  74         return TRUE;
  75 }
  76
  77
  78 /**
  79  * captive_validate_ucs2:
  80  * @string_ucs2: #const #captive_ucs2 * type string to validate.
  81  * Invalid string input is forbidden.
  82  * UTF-16 encoded strings are forbidden.
  83  *
  84  * Checks the validity of all 16-bit unicharacters of 0-terminated string.
  85  * It is required to have characters complying to g_unichar_validate().
  86  *
  87  * Returns: %TRUE if the string is valid.
  88  */
  89 gboolean captive_validate_ucs2(const captive_ucs2 *string_ucs2)
  90 {
  91 const captive_ucs2 *cs_ucs2;
  92
  93         g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
  94         g_return_val_if_fail(string_ucs2!=NULL,FALSE);
  95
  96         /* g_unichar_validate() will reject surrogates (G_UNICODE_SURROGATE) */
  97         for (cs_ucs2=string_ucs2;*cs_ucs2;cs_ucs2++)
  98                 g_return_val_if_fail(g_unichar_validate(*cs_ucs2),FALSE);
  99
 100         return TRUE;
 101 }
 102
 103
 104 /**
 105  * captive_validate_utf8:
 106  * @string_utf8: #const #gchar * utf8 type string to validate.
 107  * Invalid string input is forbidden.
 108  *
 109  * Checks the validity of all utf8 of 0-terminated string.
 110  * It is required to have characters complying to g_utf8_validate().
 111  *
 112  * Returns: %TRUE if the string is valid.
 113  */
 114 gboolean captive_validate_utf8(const gchar *string_utf8)
 115 {
 116         g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
 117         g_return_val_if_fail(string_utf8!=NULL,FALSE);
 118
 119         g_return_val_if_fail(g_utf8_validate(
 120                                         string_utf8,    /* str */
 121                                         -1,     /* max_len; -1 means '\0'-terminated */
 122                                         NULL),  /* end */
 123                         FALSE);
 124
 125         return TRUE;
 126 }
 127
 128
 129 /**
 130  * captive_ucs2_strlen:
 131  * @string_ucs2: String of type #const #gunichar2 * in pure UCS-2
 132  * Invalid string input is forbidden. UTF-16 encoded pairs are forbidden.
 133  *
 134  * Counts the number of characters (=2bytes) in @strings_ucs2.
 135  *
 136  * Returns: @string_ucs2 length in UCS-2 characters.
 137  */
 138 glong captive_ucs2_strlen(const captive_ucs2 *string_ucs2)
 139 {
 140 glong r;
 141
 142         g_return_val_if_fail(captive_validate_ucs2(string_ucs2),0);
 143
 144         for (r=0;*string_ucs2;string_ucs2++)
 145                 r++;
 146
 147         return r;
 148 }
 149
 150
 151 /**
 152  * captive_validate_UnicodeString:
 153  * @string_UnicodeString: #PUNICODE_STRING type string to validate.
 154  * Invalid string input is forbidden.
 155  *
 156  * Checks the internal consistency of the given @string_UnicodeString.
 157  * It is required to have characters complying to g_unichar_validate().
 158  *
 159  * Returns: %TRUE if the string is valid.
 160  */
 161 gboolean captive_validate_UnicodeString(const UNICODE_STRING *string_UnicodeString)
 162 {
 163         g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
 164         g_return_val_if_fail(sizeof(WCHAR)==sizeof(*string_UnicodeString->Buffer),FALSE);
 165         g_return_val_if_fail(string_UnicodeString!=NULL,FALSE);
 166         g_return_val_if_fail(string_UnicodeString->Length%sizeof(*string_UnicodeString->Buffer)==0,FALSE);
 167         g_return_val_if_fail(string_UnicodeString->MaximumLength
 168                         >=string_UnicodeString->Length+sizeof(*string_UnicodeString->Buffer),FALSE);
 169         g_return_val_if_fail(string_UnicodeString->Length==sizeof(*string_UnicodeString->Buffer)*
 170                         captive_ucs2_strlen(string_UnicodeString->Buffer)
 171                         ,FALSE);
 172
 173         g_return_val_if_fail(captive_validate_ucs2(string_UnicodeString->Buffer),FALSE);
 174
 175         return TRUE;
 176 }
 177
 178
 179 /**
 180  * captive_validate_AnsiString:
 181  * @string_AnsiString: #PANSI_STRING type string to validate.
 182  * Invalid string input is forbidden.
 183  *
 184  * Checks the internal consistency of the given @string_AnsiString.
 185  *
 186  * Returns: %TRUE if the string is valid.
 187  */
 188 gboolean captive_validate_AnsiString(const ANSI_STRING *string_AnsiString)
 189 {
 190         g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
 191         g_return_val_if_fail(sizeof(CHAR)==sizeof(*string_AnsiString->Buffer),FALSE);
 192         g_return_val_if_fail(string_AnsiString!=NULL,FALSE);
 193         g_return_val_if_fail(string_AnsiString->MaximumLength>=string_AnsiString->Length+1,FALSE);
 194         g_return_val_if_fail(string_AnsiString->Length==strlen(string_AnsiString->Buffer),FALSE);
 195
 196         return TRUE;
 197 }
 198
 199
 200 /* detect required memory size for g_alloca() */
 201 size_t _captive_UnicodeString_to_utf8_alloca_internal_sizeof(const UNICODE_STRING *string_UnicodeString)
 202 {
 203 glong length;
 204 size_t r;
 205 const WCHAR *cwcharp;
 206
 207         g_return_val_if_fail(captive_validate_UnicodeString(string_UnicodeString),1);
 208
 209         /* measure 'string_UnicodeString->Buffer' length in UTF-8 to 'r' */
 210         cwcharp=string_UnicodeString->Buffer;
 211         r=0;
 212         for (length=string_UnicodeString->Length/sizeof(*string_UnicodeString->Buffer);length;length--) {
 213 gint utf8len;
 214
 215                 utf8len=g_unichar_to_utf8(
 216                                 *cwcharp++,     /* c */
 217                                 NULL);  /* outbuf=NULL => just the length will be computed */
 218                 g_assert(utf8len>=0);
 219                 r+=utf8len;
 220                 }
 221         g_assert(*cwcharp==0);
 222         r++;    /* '\0'-termination */
 223
 224         /* utf8 byte-size */
 225         return r;
 226 }
 227
 228 /* transfer 'string_UnicodeString' to memory in 'mem' as utf8 w/o any further allocations */
 229 void _captive_UnicodeString_to_utf8_alloca_internal_fill(gchar *mem,const UNICODE_STRING *string_UnicodeString)
 230 {
 231 const WCHAR *cwcharp;
 232 #ifndef G_DISABLE_ASSERT
 233 gchar *mem_orig=mem;
 234 #endif /* G_DISABLE_ASSERT */
 235
 236         g_return_if_fail(mem!=NULL);
 237         if (!captive_validate_UnicodeString(string_UnicodeString)) {
 238                 *mem='\0';
 239                 g_return_if_reached();
 240                 }
 241
 242         /* We can't use any glib string conversions as UNICODE_STRING uses ucs2! */
 243         /* We can't use any glib string conversions as we need to write the string
 244          * to our supplied memory storage but glib always g_malloc()s it
 245          */
 246         /* copy 'string_UnicodeString->Buffer' to 'mem' */
 247         for (cwcharp=string_UnicodeString->Buffer;*cwcharp;cwcharp++) {
 248 gint utf8len;
 249
 250                 utf8len=g_unichar_to_utf8(
 251                                 (gunichar)*cwcharp,     /* c */
 252                                 mem);   /* outbuf */
 253                 g_assert(utf8len>=0);
 254                 mem+=utf8len;
 255                 }
 256         *mem='\0';
 257
 258         g_assert((size_t)((mem+1)-mem_orig) == _captive_UnicodeString_to_utf8_alloca_internal_sizeof(string_UnicodeString));
 259         g_assert(captive_validate_utf8(mem_orig));
 260 }
 261
 262
 263 /**
 264  * captive_UnicodeString_to_utf8_malloc:
 265  * @string_UnicodeString: #PUNICODE_STRING type of string to convert.
 266  *
 267  * g_malloc()-based conversion from #PUNICODE_STRING to plain #utf8 string.
 268  * You must free the result with g_free() function.
 269  *
 270  * Returns: #const #gchar * g_malloc()ed converted string @string_UnicodeString.
 271  */
 272 gchar *captive_UnicodeString_to_utf8_malloc(const UNICODE_STRING *string_UnicodeString)
 273 {
 274 gchar *r;
 275 #ifndef FUNCMALLOC_FROM_ALLOCA
 276 glong utf16_read,utf8_written;
 277 GError *err;
 278 #endif /* !FUNCMALLOC_FROM_ALLOCA */
 279
 280         g_return_val_if_fail(captive_validate_UnicodeString(string_UnicodeString),g_strdup(""));
 281
 282 #ifdef FUNCMALLOC_FROM_ALLOCA
 283
 284         r=g_malloc(_captive_UnicodeString_to_utf8_alloca_internal_sizeof(string_UnicodeString));
 285         _captive_UnicodeString_to_utf8_alloca_internal_fill(r,string_UnicodeString);
 286
 287 #else
 288
 289         err=NULL;       /* not precleared by g_utf8_to_utf16()! */
 290         r=g_utf16_to_utf8(
 291                         (const gunichar2 *)string_UnicodeString->Buffer,        /* str */
 292                         -1,     /* len=>'\0'-terminated */
 293                         &utf16_read,    /* items_read; counted in unichar2 (NOT UTF-16 characters or bytes!) */
 294                         &utf8_written,  /* items_written; counted in bytes (NOT UTF-8 characters!) */
 295                         &err);
 296         if (err) {
 297                 g_warning("%s: utf16_read=%ld,utf8_written=%ld: %s",G_STRLOC,
 298                                 (long)utf16_read,(long)utf8_written,err->message);
 299                 g_error_free(err);
 300                 g_assert(r==NULL);
 301                 g_return_val_if_reached(g_strdup(""));
 302                 }
 303         g_assert(r!=NULL);
 304
 305         g_assert(utf16_read==(glong)(string_UnicodeString->length/sizeof(*string_UnicodeString->Buffer)));
 306         g_assert(utf6_written==strlen(r));
 307
 308 #endif /* !FUNCMALLOC_FROM_ALLOCA */
 309
 310         g_assert(captive_validate_utf8(r));
 311
 312         return r;
 313 }
 314
 315
 316 /* detect required memory size for g_alloca() */
 317 size_t _captive_utf8_to_UnicodeString_alloca_internal_sizeof(const gchar *string_utf8)
 318 {
 319         g_return_val_if_fail(captive_validate_utf8(string_utf8),1);
 320
 321         /* find the value for PUNICODE_STRING->MaximumLength */
 322         return 0
 323                         +sizeof(UNICODE_STRING)
 324                         +sizeof(WCHAR)*(g_utf8_strlen(string_utf8,
 325                                         -1      /* max; -1 means '\0'-terminated */
 326                                         )+1);   /* '\0'-termination */
 327 }
 328
 329 static void terminate_static_UnicodeString(UNICODE_STRING *string_UnicodeString,glong length)
 330 {
 331         /* 'string_UnicodeString' is not yet valid in this point! */
 332         g_return_if_fail(string_UnicodeString!=NULL);
 333         g_return_if_fail(length>=0);
 334
 335         string_UnicodeString->Length=length*sizeof(WCHAR);
 336         string_UnicodeString->MaximumLength=(length+1)*sizeof(WCHAR);
 337         string_UnicodeString->Buffer[length]=0;
 338
 339         g_assert(captive_validate_UnicodeString(string_UnicodeString));
 340 }
 341
 342 /* transfer 'string_UnicodeString' to memory in 'mem' w/o any further allocations */
 343 void _captive_utf8_to_UnicodeString_alloca_internal_fill(UNICODE_STRING *mem,const gchar *string_utf8)
 344 {
 345 gunichar2 *utf16;
 346 captive_ucs2 *ucs2;
 347 glong utf8_read,utf16_written;
 348 GError *err;
 349
 350         g_return_if_fail(mem!=NULL);
 351         mem->Buffer=(PWSTR)(((char *)mem)+sizeof(*mem));        /* for terminate_static_UnicodeString() below */
 352         if (!captive_validate_utf8(string_utf8)) {
 353                 terminate_static_UnicodeString(mem,0);
 354                 g_return_if_reached();
 355                 }
 356
 357         err=NULL;       /* not precleared by g_utf8_to_utf16()! */
 358         utf16=g_utf8_to_utf16(
 359                         string_utf8,    /* str */
 360                         -1,     /* len=>'\0'-terminated */
 361                         &utf8_read,     /* items_read; counted in bytes (NOT chars!) */
 362                         &utf16_written, /* items_written; counted in UTF-16 characters (NOT unichar2 or bytes!) */
 363                         &err);
 364         if (err) {
 365                 g_warning("%s: utf8_read=%ld,utf16_written=%ld: %s",G_STRLOC,
 366                                 (long)utf8_read,(long)utf16_written,err->message);
 367                 g_error_free(err);
 368                 g_assert(utf16==NULL);
 369                 terminate_static_UnicodeString(mem,0);
 370                 g_return_if_reached();
 371                 }
 372         g_assert(utf16!=NULL);
 373
 374         /* Check for UCS-2 compliance (reject if surrogates inside) */
 375         g_assert(captive_validate_ucs2((const captive_ucs2 *)utf16));
 376         /* valid UCS-2 */
 377         ucs2=(captive_ucs2 *)utf16;
 378
 379         g_assert(utf8_read==(glong)strlen(string_utf8));
 380         g_assert(utf16_written==captive_ucs2_strlen(ucs2));
 381
 382         /* check of validity of _captive_utf8_to_UnicodeString_alloca_internal_sizeof() result */
 383         g_assert((gchar *)(mem->Buffer+(utf16_written+1))       /* +1 => '\0'-termination */
 384                         == ((gchar *)mem)+_captive_utf8_to_UnicodeString_alloca_internal_sizeof(string_utf8));
 385
 386         memcpy(mem->Buffer,ucs2,sizeof(WCHAR)*(utf16_written+1));
 387         g_free(ucs2);
 388         terminate_static_UnicodeString(mem,utf16_written);
 389
 390         g_assert(captive_validate_UnicodeString(mem));
 391 }
 392
 393
 394 /**
 395  * captive_utf8_to_UnicodeString_malloc:
 396  * @string_utf8: #const #gchar * string in #utf8 to convert.
 397  *
 398  * g_malloc()-based conversion from plain #utf8 string to #PUNICODE_STRING.
 399  * You must free the result with g_free() function.
 400  *
 401  * Returns: #PUNICODE_STRING g_malloc()ed converted string @string_utf8.
 402  */
 403 PUNICODE_STRING captive_utf8_to_UnicodeString_malloc(const gchar *string_utf8)
 404 {
 405 UNICODE_STRING *r;
 406 #ifndef FUNCMALLOC_FROM_ALLOCA
 407 gunichar *ucs4;
 408 glong utf8_read,ucs4_written;
 409 GError *err;
 410 #endif /* !FUNCMALLOC_FROM_ALLOCA */
 411
 412         g_return_val_if_fail(captive_validate_utf8(string_utf8),captive_utf8_to_UnicodeString_malloc(""));
 413
 414 #ifdef FUNCMALLOC_FROM_ALLOCA
 415
 416         r=g_malloc(_captive_utf8_to_UnicodeString_alloca_internal_sizeof(string_utf8));
 417         _captive_utf8_to_UnicodeString_alloca_internal_fill(r,string_utf8);
 418
 419 #else
 420
 421 #error "FIXME: NOT IMPLEMENTED"
 422
 423 #endif /* !FUNCMALLOC_FROM_ALLOCA */
 424
 425         g_assert(captive_validate_UnicodeString(r));
 426
 427         return r;
 428 }
 429
 430
 431 /* map: (const gunichar *) -> (const gunichar2 *); UCS-4 -> UTF-16 */
 432 static GHashTable *captive_ucs4_to_utf16_hash;
 433
 434 static void captive_ucs4_to_utf16_hash_init(void)
 435 {
 436         if (captive_ucs4_to_utf16_hash)
 437                 return;
 438         captive_ucs4_to_utf16_hash=g_hash_table_new_full(
 439                         g_direct_hash,  /* hash_func */
 440                         g_direct_equal, /* key_equal_func */
 441                         (GDestroyNotify)NULL,   /* key_destroy_func; we require persistent strings as input */
 442                         (GDestroyNotify)g_free);        /* value_destroy_func; result of g_ucs4_to_utf16() */
 443 }
 444
 445 /**
 446  * captive_ucs4_to_utf16_const:
 447  * @string_ucs4: #const #gunichar * type of persistent string to convert.
 448  * This string MUST remain readable with the same content forever.
 449  *
 450  * Constant string conversion from 32-bit #wchar_t to 16-bit (possible pairs of) UTF-16.
 451  * You may not modify the result in any way.
 452  *
 453  * It is guaranteed to get two different string addresses for two different
 454  * input addresses even if the input strings content is the same.
 455  * Otherwise we would behave as #GCC option %-fmerge-constants which
 456  * results in %C non-conforming behaviour.
 457  *
 458  * FIXME: UTF-16 encoding IS NOT IMPLEMENTED.
 459  *
 460  * See also captive_ucs4_to_ucs2_const().
 461  *
 462  * Returns: #const #gunichar2 * converted string @string_ucs4.
 463  */
 464 const gunichar2 *captive_ucs4_to_utf16_const(const gunichar *string_ucs4)
 465 {
 466 glong ucs4_read,utf16_written;
 467 GError *err;
 468 const gunichar2 *r_lookup;
 469 gunichar2 *r;
 470
 471         g_return_val_if_fail(captive_validate_ucs4(string_ucs4),captive_ucs4_to_utf16_const((const gunichar *)L""));
 472
 473         captive_ucs4_to_utf16_hash_init();
 474
 475         /* found already existing item in the table */
 476         if ((r_lookup=g_hash_table_lookup(captive_ucs4_to_utf16_hash,
 477                         string_ucs4)    /* key */
 478                         )) {
 479                 return r_lookup;
 480                 }
 481
 482         /* Prepare 'r' as UTF-16 */
 483         err=NULL;       /* not precleared by g_ucs4_to_utf16()! */
 484         r=g_ucs4_to_utf16(
 485                         (const gunichar *)string_ucs4,  /* str */
 486                         -1,     /* len; -1 means '\0'-termination */
 487                         &ucs4_read,     /* items_read; counted in chars (==unichars; NOT bytes!) */
 488                         &utf16_written, /* items_written; counted in gunichar2 (NOT chars or bytes!) */
 489                         &err);
 490         if (err) {
 491                 g_warning("%s: ucs4_read=%ld,utf16_written=%ld: %s",G_STRLOC,
 492                                 (long)ucs4_read,(long)utf16_written,err->message);
 493                 g_error_free(err);
 494                 g_assert(r==NULL);
 495                 g_return_val_if_reached(captive_ucs4_to_utf16_const((const gunichar *)L""));
 496                 }
 497         g_assert(r!=NULL);
 498         g_assert(ucs4_read==(glong)wcslen((const wchar_t *)string_ucs4));
 499         /* FIXME: We don't have captive_utf16_strlen() */
 500         g_assert(utf16_written==(glong)captive_ucs2_strlen((const gunichar2 *)r));
 501         /* (ucs4_read==utf16_written) check would discard any double-pair UTF-16 encodings
 502          * but this function is designed as UTF-16 compliant.
 503          */
 504
 505         /* store new item to the table */
 506         g_hash_table_insert(captive_ucs4_to_utf16_hash,
 507                         (gpointer)string_ucs4,  /* key; de-const */
 508                         r);     /* value */
 509
 510 #if 0   /* We don't have captive_validate_utf16() */
 511         g_assert(captive_validate_utf16(r));
 512 #endif
 513
 514         return r;
 515 }