2 * Unicode add-ons to reactos ntoskrnl/rtl/unicode.c for libcaptive
3 * Copyright (C) 2002 Jan Kratochvil <project-captive@jankratochvil.net>
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; exactly version 2 of June 1991 is required
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #include "captive/unicode.h" /* self */
23 #include "captive/unicode_reactos.h" /* for captive_ucs2 */
24 #include <glib/gtypes.h>
25 #include <glib/gmessages.h>
26 #include <glib/gunicode.h>
27 #include <glib/gmem.h>
28 #include "reactos/napi/types.h" /* for PUNICODE_STRING etc. */
29 #include "reactos/unicode.h"
30 #include "captive/macros.h"
31 #include <glib/gstrfuncs.h>
32 #include <wchar.h> /* for wcslen() */
33 #include <glib/ghash.h>
38 /* Use simplified g_malloc() functions as wrappers around g_alloca() ones.
40 #define FUNCMALLOC_FROM_ALLOCA 1
/* Sanity check of the character type sizes the rest of this file relies on:
 * gunichar must be 4 bytes, WCHAR 2 bytes and CHAR 1 byte.
 * Each g_return_val_if_fail() returns FALSE when its size check does not hold.
 */
44 static gboolean captive_validate_unicode_types(void)
46 g_return_val_if_fail(4==sizeof(gunichar),FALSE);
47 g_return_val_if_fail(2==sizeof(WCHAR),FALSE);
48 g_return_val_if_fail(1==sizeof(CHAR),FALSE);
55 * captive_validate_ucs4:
56 * @string_ucs4: #const #gunichar * type string to validate.
57 * Invalid string input is forbidden.
59 * Checks the validity of all 32-bit unicharacters of 0-terminated string.
60 * It is required to have characters complying to g_unichar_validate().
62 * Returns: %TRUE if the string is valid.
/* Walks the 0-terminated @string_ucs4 and requires g_unichar_validate() to
 * accept every element; any failed check returns FALSE.
 */
64 gboolean captive_validate_ucs4(const gunichar *string_ucs4)
66 const gunichar *cs_ucs4;
/* the type-size assumptions are verified before any character is inspected */
68 g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
/* reject NULL before the loop below dereferences the pointer */
69 g_return_val_if_fail(string_ucs4!=NULL,FALSE);
/* the scan stops at the first 0 element */
71 for (cs_ucs4=string_ucs4;*cs_ucs4;cs_ucs4++)
72 g_return_val_if_fail(g_unichar_validate(*cs_ucs4),FALSE);
79 * captive_validate_ucs2:
80 * @string_ucs2: #const #captive_ucs2 * type string to validate.
81 * Invalid string input is forbidden.
82 * UTF-16 encoded strings are forbidden.
84 * Checks the validity of all 16-bit unicharacters of 0-terminated string.
85 * It is required to have characters complying to g_unichar_validate().
87 * Returns: %TRUE if the string is valid.
/* Walks the 0-terminated @string_ucs2 and passes every 16-bit unit to
 * g_unichar_validate(); any failed check returns FALSE.
 */
89 gboolean captive_validate_ucs2(const captive_ucs2 *string_ucs2)
91 const captive_ucs2 *cs_ucs2;
/* the type-size assumptions are verified before any character is inspected */
93 g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
/* reject NULL before the loop below dereferences the pointer */
94 g_return_val_if_fail(string_ucs2!=NULL,FALSE);
96 /* g_unichar_validate() will reject surrogates (G_UNICODE_SURROGATE) */
/* the scan stops at the first 0 unit */
97 for (cs_ucs2=string_ucs2;*cs_ucs2;cs_ucs2++)
98 g_return_val_if_fail(g_unichar_validate(*cs_ucs2),FALSE);
105 * captive_validate_utf8:
106 * @string_utf8: #const #gchar * utf8 type string to validate.
107 * Invalid string input is forbidden.
109 * Checks the validity of all utf8 of 0-terminated string.
110 * It is required to have characters complying to g_utf8_validate().
112 * Returns: %TRUE if the string is valid.
/* Validates @string_utf8 by delegating to g_utf8_validate() over the whole
 * 0-terminated string (max_len of -1); a failed check returns FALSE.
 */
114 gboolean captive_validate_utf8(const gchar *string_utf8)
/* the type-size assumptions are verified before the string is inspected */
116 g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
/* reject NULL before handing the pointer to glib */
117 g_return_val_if_fail(string_utf8!=NULL,FALSE);
119 g_return_val_if_fail(g_utf8_validate(
120 string_utf8, /* str */
121 -1, /* max_len; -1 means '\0'-terminated */
130 * captive_ucs2_strlen:
131 * @string_ucs2: String of type #const #captive_ucs2 * in pure UCS-2.
132 * Invalid string input is forbidden. UTF-16 encoded pairs are forbidden.
134 * Counts the number of characters (=2bytes) in @string_ucs2.
136 * Returns: @string_ucs2 length in UCS-2 characters.
/* Length of @string_ucs2 in 16-bit units, not counting the terminating 0.
 * The string is validated first; 0 is returned when captive_validate_ucs2()
 * rejects it.
 * NOTE(review): presumably 'r' is incremented once per unit by the loop and
 * then returned -- confirm against the loop body.
 */
138 glong captive_ucs2_strlen(const captive_ucs2 *string_ucs2)
142 g_return_val_if_fail(captive_validate_ucs2(string_ucs2),0);
/* the parameter itself is used as the cursor; the scan stops at the first 0 unit */
144 for (r=0;*string_ucs2;string_ucs2++)
152 * captive_validate_UnicodeString:
153 * @string_UnicodeString: #PUNICODE_STRING type string to validate.
154 * Invalid string input is forbidden.
156 * Checks the internal consistency of the given @string_UnicodeString.
157 * It is required to have characters complying to g_unichar_validate().
159 * Returns: %TRUE if the string is valid.
/* Consistency check of a UNICODE_STRING; any failed check returns FALSE.
 * Checked here: Buffer elements are WCHAR sized, the pointer is not NULL,
 * Length is a whole number of elements, MaximumLength leaves room for one
 * more element after Length, Length matches the 0-terminated length of
 * Buffer, and the Buffer content passes captive_validate_ucs2().
 */
161 gboolean captive_validate_UnicodeString(const UNICODE_STRING *string_UnicodeString)
163 g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
/* sizeof does not evaluate its operand, so this is safe ahead of the NULL check */
164 g_return_val_if_fail(sizeof(WCHAR)==sizeof(*string_UnicodeString->Buffer),FALSE);
165 g_return_val_if_fail(string_UnicodeString!=NULL,FALSE);
/* Length is a byte count and must cover whole elements only */
166 g_return_val_if_fail(string_UnicodeString->Length%sizeof(*string_UnicodeString->Buffer)==0,FALSE);
/* MaximumLength must hold Length plus one element (the terminator) */
167 g_return_val_if_fail(string_UnicodeString->MaximumLength
168 >=string_UnicodeString->Length+sizeof(*string_UnicodeString->Buffer),FALSE);
/* the first 0 element of Buffer must sit exactly at offset Length */
169 g_return_val_if_fail(string_UnicodeString->Length==sizeof(*string_UnicodeString->Buffer)*
170 captive_ucs2_strlen(string_UnicodeString->Buffer)
173 g_return_val_if_fail(captive_validate_ucs2(string_UnicodeString->Buffer),FALSE);
180 * captive_validate_AnsiString:
181 * @string_AnsiString: #PANSI_STRING type string to validate.
182 * Invalid string input is forbidden.
184 * Checks the internal consistency of the given @string_AnsiString.
186 * Returns: %TRUE if the string is valid.
/* Consistency check of an ANSI_STRING; any failed check returns FALSE.
 * Checked here: Buffer elements are CHAR sized, the pointer is not NULL,
 * MaximumLength leaves room for one byte after Length, and Length equals
 * strlen() of Buffer.
 * NOTE(review): strlen() needs <string.h>, which is not among the includes
 * listed at the top of this file -- confirm it arrives through another header.
 */
188 gboolean captive_validate_AnsiString(const ANSI_STRING *string_AnsiString)
190 g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
/* sizeof does not evaluate its operand, so this is safe ahead of the NULL check */
191 g_return_val_if_fail(sizeof(CHAR)==sizeof(*string_AnsiString->Buffer),FALSE);
192 g_return_val_if_fail(string_AnsiString!=NULL,FALSE);
/* MaximumLength must hold Length plus the terminating byte */
193 g_return_val_if_fail(string_AnsiString->MaximumLength>=string_AnsiString->Length+1,FALSE);
/* the first 0 byte of Buffer must sit exactly at offset Length */
194 g_return_val_if_fail(string_AnsiString->Length==strlen(string_AnsiString->Buffer),FALSE);
200 /* detect required memory size for g_alloca() */
/* Number of bytes needed to hold @string_UnicodeString as 0-terminated UTF-8.
 * A string rejected by captive_validate_UnicodeString() yields 1.
 * The element count is taken from Length (not from scanning for 0), each
 * element is measured with g_unichar_to_utf8(), and one byte is added for the
 * terminator.
 * NOTE(review): presumably each 'utf8len' is accumulated into 'r', which is
 * then returned -- confirm against the loop body.
 */
201 size_t _captive_UnicodeString_to_utf8_alloca_internal_sizeof(const UNICODE_STRING *string_UnicodeString)
205 const WCHAR *cwcharp;
207 g_return_val_if_fail(captive_validate_UnicodeString(string_UnicodeString),1);
209 /* measure 'string_UnicodeString->Buffer' length in UTF-8 to 'r' */
210 cwcharp=string_UnicodeString->Buffer;
/* Length is in bytes; dividing by the element size gives the number of elements to visit */
212 for (length=string_UnicodeString->Length/sizeof(*string_UnicodeString->Buffer);length;length--) {
215 utf8len=g_unichar_to_utf8(
217 NULL); /* outbuf=NULL => just the length will be computed */
218 g_assert(utf8len>=0);
/* after the counted elements the cursor is expected to rest on the terminator */
221 g_assert(*cwcharp==0);
222 r++; /* '\0'-termination */
228 /* transfer 'string_UnicodeString' to memory in 'mem' as utf8 w/o any further allocations */
/* Writes @string_UnicodeString as UTF-8 into the caller-supplied @mem.
 * @mem must not be NULL; the asserts at the end expect the bytes written
 * (terminator included) to equal the value computed by
 * _captive_UnicodeString_to_utf8_alloca_internal_sizeof() for the same string.
 * Each element is converted on its own with g_unichar_to_utf8().
 * NOTE(review): 'mem_orig' is used only by the final asserts; presumably it is
 * the start of @mem saved inside the G_DISABLE_ASSERT guard -- confirm.
 */
229 void _captive_UnicodeString_to_utf8_alloca_internal_fill(gchar *mem,const UNICODE_STRING *string_UnicodeString)
231 const WCHAR *cwcharp;
232 #ifndef G_DISABLE_ASSERT
234 #endif /* G_DISABLE_ASSERT */
236 g_return_if_fail(mem!=NULL);
/* invalid input is reported through g_return_if_reached() instead of being converted */
237 if (!captive_validate_UnicodeString(string_UnicodeString)) {
239 g_return_if_reached();
242 /* We can't use any glib string conversions as UNICODE_STRING uses ucs2! */
243 /* We can't use any glib string conversions as we need to write the string
244 * to our supplied memory storage but glib always g_malloc()s it
246 /* copy 'string_UnicodeString->Buffer' to 'mem' */
247 for (cwcharp=string_UnicodeString->Buffer;*cwcharp;cwcharp++) {
250 utf8len=g_unichar_to_utf8(
251 (gunichar)*cwcharp, /* c */
253 g_assert(utf8len>=0);
/* (mem+1)-mem_orig: bytes used including the terminator; must agree with the _sizeof() helper */
258 g_assert((size_t)((mem+1)-mem_orig) == _captive_UnicodeString_to_utf8_alloca_internal_sizeof(string_UnicodeString));
259 g_assert(captive_validate_utf8(mem_orig));
264 * captive_UnicodeString_to_utf8_malloc:
265 * @string_UnicodeString: #PUNICODE_STRING type of string to convert.
267 * g_malloc()-based conversion from #PUNICODE_STRING to plain #utf8 string.
268 * You must free the result with g_free() function.
270 * Returns: #const #gchar * g_malloc()ed converted string @string_UnicodeString.
/* g_malloc()-based conversion of @string_UnicodeString to UTF-8.
 * A string rejected by captive_validate_UnicodeString() yields g_strdup(""),
 * so the error results shown here can also be passed to g_free().
 * FUNCMALLOC_FROM_ALLOCA selects between the two implementations below:
 * the first sizes and fills the buffer with the _alloca_internal helpers,
 * the second is the glib conversion path with its own cross-check asserts.
 */
272 gchar *captive_UnicodeString_to_utf8_malloc(const UNICODE_STRING *string_UnicodeString)
275 #ifndef FUNCMALLOC_FROM_ALLOCA
276 glong utf16_read,utf8_written;
278 #endif /* !FUNCMALLOC_FROM_ALLOCA */
280 g_return_val_if_fail(captive_validate_UnicodeString(string_UnicodeString),g_strdup(""));
282 #ifdef FUNCMALLOC_FROM_ALLOCA
/* allocate exactly what the _sizeof() helper reports, then let _fill() write into it */
284 r=g_malloc(_captive_UnicodeString_to_utf8_alloca_internal_sizeof(string_UnicodeString));
285 _captive_UnicodeString_to_utf8_alloca_internal_fill(r,string_UnicodeString);
289 err=NULL; /* not precleared by g_utf16_to_utf8()! */
291 (const gunichar2 *)string_UnicodeString->Buffer, /* str */
292 -1, /* len=>'\0'-terminated */
293 &utf16_read, /* items_read; counted in unichar2 (NOT UTF-16 characters or bytes!) */
294 &utf8_written, /* items_written; counted in bytes (NOT UTF-8 characters!) */
297 g_warning("%s: utf16_read=%ld,utf8_written=%ld: %s",G_STRLOC,
298 (long)utf16_read,(long)utf8_written,err->message);
301 g_return_val_if_reached(g_strdup(""));
/* the UNICODE_STRING field is spelled 'Length'; it is a byte count, hence the division */
305 g_assert(utf16_read==(glong)(string_UnicodeString->Length/sizeof(*string_UnicodeString->Buffer)));
/* compare against the declared 'utf8_written' counter; strlen() is cast to keep the comparison signed */
306 g_assert(utf8_written==(glong)strlen(r));
308 #endif /* !FUNCMALLOC_FROM_ALLOCA */
310 g_assert(captive_validate_utf8(r));
316 /* detect required memory size for g_alloca() */
/* Number of bytes needed to hold @string_utf8 as a UNICODE_STRING header
 * immediately followed by its 0-terminated WCHAR buffer.
 * A string rejected by captive_validate_utf8() yields 1.
 * g_utf8_strlen() counts characters, so exactly one WCHAR is reserved per
 * character plus one for the terminator.
 */
317 size_t _captive_utf8_to_UnicodeString_alloca_internal_sizeof(const gchar *string_utf8)
319 g_return_val_if_fail(captive_validate_utf8(string_utf8),1);
321 /* find the value for PUNICODE_STRING->MaximumLength */
323 +sizeof(UNICODE_STRING)
324 +sizeof(WCHAR)*(g_utf8_strlen(string_utf8,
325 -1 /* max; -1 means '\0'-terminated */
326 )+1); /* '\0'-termination */
/* Finalizes a UNICODE_STRING whose Buffer already holds @length elements:
 * sets Length and MaximumLength (both in bytes) and writes the terminating 0
 * at Buffer[length]. Buffer must already point to storage for length+1
 * elements, as it is written through here.
 */
329 static void terminate_static_UnicodeString(UNICODE_STRING *string_UnicodeString,glong length)
331 /* 'string_UnicodeString' is not yet valid in this point! */
332 g_return_if_fail(string_UnicodeString!=NULL);
333 g_return_if_fail(length>=0);
335 string_UnicodeString->Length=length*sizeof(WCHAR);
/* one extra element accounts for the terminator */
336 string_UnicodeString->MaximumLength=(length+1)*sizeof(WCHAR);
337 string_UnicodeString->Buffer[length]=0;
/* from here on the structure must pass the full consistency check */
339 g_assert(captive_validate_UnicodeString(string_UnicodeString));
342 /* transfer 'string_utf8' to memory in 'mem' as UNICODE_STRING w/o any further allocations */
/* Builds a UNICODE_STRING for @string_utf8 inside the caller-supplied @mem.
 * The WCHAR buffer is placed directly behind the UNICODE_STRING header, so
 * @mem must be at least as large as the value computed by
 * _captive_utf8_to_UnicodeString_alloca_internal_sizeof() for the same string.
 * Invalid UTF-8 or a failed glib conversion leaves an empty, terminated
 * string in @mem and reports through g_return_if_reached().
 * NOTE(review): the sizing helper reserves one WCHAR per character, while the
 * memcpy() below copies utf16_written+1 units; the only guards against
 * surrogate pairs are g_assert()s, which G_DISABLE_ASSERT removes -- confirm
 * that this is acceptable.
 * NOTE(review): 'utf16' is allocated by g_utf8_to_utf16(); confirm it is
 * released with g_free() after the copy.
 */
343 void _captive_utf8_to_UnicodeString_alloca_internal_fill(UNICODE_STRING *mem,const gchar *string_utf8)
347 glong utf8_read,utf16_written;
350 g_return_if_fail(mem!=NULL);
351 mem->Buffer=(PWSTR)(((char *)mem)+sizeof(*mem)); /* for terminate_static_UnicodeString() below */
352 if (!captive_validate_utf8(string_utf8)) {
353 terminate_static_UnicodeString(mem,0);
354 g_return_if_reached();
357 err=NULL; /* not precleared by g_utf8_to_utf16()! */
358 utf16=g_utf8_to_utf16(
359 string_utf8, /* str */
360 -1, /* len=>'\0'-terminated */
361 &utf8_read, /* items_read; counted in bytes (NOT chars!) */
362 &utf16_written, /* items_written; counted in gunichar2 units (NOT characters or bytes!) */
365 g_warning("%s: utf8_read=%ld,utf16_written=%ld: %s",G_STRLOC,
366 (long)utf8_read,(long)utf16_written,err->message);
368 g_assert(utf16==NULL);
369 terminate_static_UnicodeString(mem,0);
370 g_return_if_reached();
372 g_assert(utf16!=NULL);
374 /* Check for UCS-2 compliance (reject if surrogates inside) */
375 g_assert(captive_validate_ucs2((const captive_ucs2 *)utf16));
377 ucs2=(captive_ucs2 *)utf16;
/* the whole input must have been consumed and the unit count must match the scanned length */
379 g_assert(utf8_read==(glong)strlen(string_utf8));
380 g_assert(utf16_written==captive_ucs2_strlen(ucs2));
382 /* check of validity of _captive_utf8_to_UnicodeString_alloca_internal_sizeof() result */
383 g_assert((gchar *)(mem->Buffer+(utf16_written+1)) /* +1 => '\0'-termination */
384 == ((gchar *)mem)+_captive_utf8_to_UnicodeString_alloca_internal_sizeof(string_utf8));
/* the terminator is copied as well, hence utf16_written+1 units */
386 memcpy(mem->Buffer,ucs2,sizeof(WCHAR)*(utf16_written+1));
388 terminate_static_UnicodeString(mem,utf16_written);
390 g_assert(captive_validate_UnicodeString(mem));
395 * captive_utf8_to_UnicodeString_malloc:
396 * @string_utf8: #const #gchar * string in #utf8 to convert.
398 * g_malloc()-based conversion from plain #utf8 string to #PUNICODE_STRING.
399 * You must free the result with g_free() function.
401 * Returns: #PUNICODE_STRING g_malloc()ed converted string @string_utf8.
/* g_malloc()-based conversion of @string_utf8 to a UNICODE_STRING.
 * With FUNCMALLOC_FROM_ALLOCA defined, the block is sized by
 * _captive_utf8_to_UnicodeString_alloca_internal_sizeof() and filled by
 * _captive_utf8_to_UnicodeString_alloca_internal_fill(); the other branch is
 * an #error placeholder.
 * NOTE(review): the fallback for invalid input calls this function again with
 * "". captive_validate_utf8() also fails whenever
 * captive_validate_unicode_types() fails, in which case the recursion would
 * not terminate -- confirm that this cannot happen in supported builds.
 */
403 PUNICODE_STRING captive_utf8_to_UnicodeString_malloc(const gchar *string_utf8)
406 #ifndef FUNCMALLOC_FROM_ALLOCA
408 glong utf8_read,ucs4_written;
410 #endif /* !FUNCMALLOC_FROM_ALLOCA */
/* invalid input is answered with the conversion of the empty string */
412 g_return_val_if_fail(captive_validate_utf8(string_utf8),captive_utf8_to_UnicodeString_malloc(""));
414 #ifdef FUNCMALLOC_FROM_ALLOCA
/* header and WCHAR buffer share this single allocation */
416 r=g_malloc(_captive_utf8_to_UnicodeString_alloca_internal_sizeof(string_utf8));
417 _captive_utf8_to_UnicodeString_alloca_internal_fill(r,string_utf8);
421 #error "FIXME: NOT IMPLEMENTED"
423 #endif /* !FUNCMALLOC_FROM_ALLOCA */
425 g_assert(captive_validate_UnicodeString(r));
431 /* map: (const gunichar *) -> (const gunichar2 *); UCS-4 -> UTF-16 */
/* Cache of converted strings, keyed by the address of the input string. */
432 static GHashTable *captive_ucs4_to_utf16_hash;
/* Creates captive_ucs4_to_utf16_hash; the check at the top covers the case
 * where the table already exists.
 * Keys are hashed and compared as pointers (g_direct_hash/g_direct_equal),
 * not by string content; keys are never freed by the table, values are
 * released with g_free().
 */
434 static void captive_ucs4_to_utf16_hash_init(void)
436 if (captive_ucs4_to_utf16_hash)
438 captive_ucs4_to_utf16_hash=g_hash_table_new_full(
439 g_direct_hash, /* hash_func */
440 g_direct_equal, /* key_equal_func */
441 (GDestroyNotify)NULL, /* key_destroy_func; we require persistent strings as input */
442 (GDestroyNotify)g_free); /* value_destroy_func; result of g_ucs4_to_utf16() */
446 * captive_ucs4_to_utf16_const:
447 * @string_ucs4: #const #gunichar * type of persistent string to convert.
448 * This string MUST remain readable with the same content forever.
450 * Constant string conversion from 32-bit #wchar_t to 16-bit (possible pairs of) UTF-16.
451 * You may not modify the result in any way.
453 * It is guaranteed to get two different string addresses for two different
454 * input addresses even if the input strings content is the same.
455 * Otherwise we would behave as #GCC option %-fmerge-constants which
456 * results in %C non-conforming behaviour.
458 * FIXME: UTF-16 encoding IS NOT IMPLEMENTED.
460 * See also captive_ucs4_to_ucs2_const().
462 * Returns: #const #gunichar2 * converted string @string_ucs4.
464 const gunichar2 *captive_ucs4_to_utf16_const(const gunichar *string_ucs4)
466 glong ucs4_read,utf16_written;
468 const gunichar2 *r_lookup;
471 g_return_val_if_fail(captive_validate_ucs4(string_ucs4),captive_ucs4_to_utf16_const((const gunichar *)L""));
473 captive_ucs4_to_utf16_hash_init();
475 /* found already existing item in the table */
476 if ((r_lookup=g_hash_table_lookup(captive_ucs4_to_utf16_hash,
477 string_ucs4) /* key */
482 /* Prepare 'r' as UTF-16 */
483 err=NULL; /* not precleared by g_ucs4_to_utf16()! */
485 (const gunichar *)string_ucs4, /* str */
486 -1, /* len; -1 means '\0'-termination */
487 &ucs4_read, /* items_read; counted in chars (==unichars; NOT bytes!) */
488 &utf16_written, /* items_written; counted in gunichar2 (NOT chars or bytes!) */
491 g_warning("%s: ucs4_read=%ld,utf16_written=%ld: %s",G_STRLOC,
492 (long)ucs4_read,(long)utf16_written,err->message);
495 g_return_val_if_reached(captive_ucs4_to_utf16_const((const gunichar *)L""));
498 g_assert(ucs4_read==(glong)wcslen((const wchar_t *)string_ucs4));
499 /* FIXME: We don't have captive_utf16_strlen() */
500 g_assert(utf16_written==(glong)captive_ucs2_strlen((const gunichar2 *)r));
501 /* (ucs4_read==utf16_written) check would discard any double-pair UTF-16 encodings
502 * but this function is designed as UTF-16 compliant.
505 /* store new item to the table */
506 g_hash_table_insert(captive_ucs4_to_utf16_hash,
507 (gpointer)string_ucs4, /* key; de-const */
510 #if 0 /* We don't have captive_validate_utf16() */
511 g_assert(captive_validate_utf16(r));