bootstrap
[captive.git] / src / libcaptive / rtl / unicode.c
1 /* $Id$
2  * Unicode add-ons to reactos ntoskrnl/rtl/unicode.c for libcaptive
3  * Copyright (C) 2002 Jan Kratochvil <project-captive@jankratochvil.net>
4  * 
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; exactly version 2 of June 1991 is required
8  * 
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  * 
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17  */
18
19
20 #include "config.h"
21
22 #include "captive/unicode.h"    /* self */
23 #include "captive/unicode_reactos.h"    /* for captive_ucs2 */
24 #include <glib/gtypes.h>
25 #include <glib/gmessages.h>
26 #include <glib/gunicode.h>
27 #include <glib/gmem.h>
28 #include "reactos/napi/types.h"  /* for PUNICODE_STRING etc. */
29 #include "reactos/unicode.h"
30 #include "captive/macros.h"
31 #include <glib/gstrfuncs.h>
32 #include <wchar.h>      /* for wcslen() */
33 #include <glib/ghash.h>
34 #include <string.h>
35
36
/* CONFIG: */
/* Use simplified g_malloc() functions as wrappers around g_alloca() ones.
 * The macro is tested only by #ifdef/#ifndef, its value is never read.
 * When defined, the *_malloc() conversions in this file size their buffer
 * with the matching *_alloca_internal_sizeof() helper and fill it with the
 * matching *_alloca_internal_fill() helper.
 * When not defined, captive_UnicodeString_to_utf8_malloc() converts through
 * g_utf16_to_utf8() and captive_utf8_to_UnicodeString_malloc() stops the
 * build with #error.
 */
#define FUNCMALLOC_FROM_ALLOCA 1
41
42
/* compiler sanity */
/* Checks that the character types have the byte widths this file relies on:
 * gunichar 4 bytes, WCHAR 2 bytes and CHAR 1 byte.
 * Returns TRUE if all three checks pass; a failing check returns FALSE
 * through g_return_val_if_fail().
 */
static gboolean captive_validate_unicode_types(void)
{
        g_return_val_if_fail(4==sizeof(gunichar),FALSE);
        g_return_val_if_fail(2==sizeof(WCHAR),FALSE);
        g_return_val_if_fail(1==sizeof(CHAR),FALSE);

        return TRUE;
}
52
53
54 /**
55  * captive_validate_ucs4:
56  * @string_ucs4: #const #gunichar * type string to validate.
57  * Invalid string input is forbidden.
58  *
59  * Checks the validity of all 32-bit unicharacters of 0-terminated string.
60  * It is required to have characters complying to g_unichar_validate().
61  *
62  * Returns: %TRUE if the string is valid.
63  */ 
64 gboolean captive_validate_ucs4(const gunichar *string_ucs4)
65 {
66 const gunichar *cs_ucs4;
67
68         g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
69         g_return_val_if_fail(string_ucs4!=NULL,FALSE);
70
71         for (cs_ucs4=string_ucs4;*cs_ucs4;cs_ucs4++)
72                 g_return_val_if_fail(g_unichar_validate(*cs_ucs4),FALSE);
73
74         return TRUE;
75 }
76
77
78 /**
79  * captive_validate_ucs2:
80  * @string_ucs2: #const #captive_ucs2 * type string to validate.
81  * Invalid string input is forbidden.
82  * UTF-16 encoded strings are forbidden.
83  *
84  * Checks the validity of all 16-bit unicharacters of 0-terminated string.
85  * It is required to have characters complying to g_unichar_validate().
86  *
87  * Returns: %TRUE if the string is valid.
88  */ 
89 gboolean captive_validate_ucs2(const captive_ucs2 *string_ucs2)
90 {
91 const captive_ucs2 *cs_ucs2;
92
93         g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
94         g_return_val_if_fail(string_ucs2!=NULL,FALSE);
95
96         /* g_unichar_validate() will reject surrogates (G_UNICODE_SURROGATE) */
97         for (cs_ucs2=string_ucs2;*cs_ucs2;cs_ucs2++)
98                 g_return_val_if_fail(g_unichar_validate(*cs_ucs2),FALSE);
99
100         return TRUE;
101 }
102
103
/**
 * captive_validate_utf8:
 * @string_utf8: #const #gchar * 0-terminated UTF-8 string to validate.
 * Invalid string input is forbidden.
 *
 * Checks the whole 0-terminated @string_utf8 with g_utf8_validate().
 * A %NULL @string_utf8 is reported as invalid.
 *
 * Returns: %TRUE if the string is valid.
 */ 
gboolean captive_validate_utf8(const gchar *string_utf8)
{
        g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
        g_return_val_if_fail(string_utf8!=NULL,FALSE);

        /* the position of the first invalid byte is not needed => end=NULL */
        g_return_val_if_fail(g_utf8_validate(
                                        string_utf8,    /* str */
                                        -1,     /* max_len; -1 means '\0'-terminated */
                                        NULL),  /* end */
                        FALSE);

        return TRUE;
}
127
128
129 /**
130  * captive_ucs2_strlen:
131  * @string_ucs2: String of type #const #gunichar2 * in pure UCS-2
132  * Invalid string input is forbidden. UTF-16 encoded pairs are forbidden.
133  *
134  * Counts the number of characters (=2bytes) in @strings_ucs2.
135  *
136  * Returns: @string_ucs2 length in UCS-2 characters.
137  */
138 glong captive_ucs2_strlen(const captive_ucs2 *string_ucs2)
139 {
140 glong r;
141
142         g_return_val_if_fail(captive_validate_ucs2(string_ucs2),0);
143
144         for (r=0;*string_ucs2;string_ucs2++)
145                 r++;
146
147         return r;
148 }
149
150
/**
 * captive_validate_UnicodeString:
 * @string_UnicodeString: #PUNICODE_STRING type string to validate.
 * Invalid string input is forbidden.
 *
 * Checks the internal consistency of the given @string_UnicodeString:
 * Length must be a whole number of characters, MaximumLength must leave room
 * for one more character after Length, Length must equal the byte length of
 * the 0-terminated Buffer and Buffer must pass captive_validate_ucs2()
 * (characters complying to g_unichar_validate()).
 *
 * Returns: %TRUE if the string is valid.
 */
gboolean captive_validate_UnicodeString(const UNICODE_STRING *string_UnicodeString)
{
        g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
        /* sizeof does not evaluate its operand: no dereference happens before the NULL check below */
        g_return_val_if_fail(sizeof(WCHAR)==sizeof(*string_UnicodeString->Buffer),FALSE);
        g_return_val_if_fail(string_UnicodeString!=NULL,FALSE);
        /* byte length must cover whole characters only */
        g_return_val_if_fail(string_UnicodeString->Length%sizeof(*string_UnicodeString->Buffer)==0,FALSE);
        /* storage must have room for the terminating 0 character */
        g_return_val_if_fail(string_UnicodeString->MaximumLength
                        >=string_UnicodeString->Length+sizeof(*string_UnicodeString->Buffer),FALSE);
        /* Buffer must be 0-terminated exactly at Length;
         * captive_ucs2_strlen() returns 0 for a Buffer it considers invalid
         */
        g_return_val_if_fail(string_UnicodeString->Length==sizeof(*string_UnicodeString->Buffer)*
                        captive_ucs2_strlen(string_UnicodeString->Buffer)
                        ,FALSE);

        g_return_val_if_fail(captive_validate_ucs2(string_UnicodeString->Buffer),FALSE);

        return TRUE;
}
177
178
179 /**
180  * captive_validate_AnsiString:
181  * @string_AnsiString: #PANSI_STRING type string to validate.
182  * Invalid string input is forbidden.
183  *
184  * Checks the internal consistency of the given @string_AnsiString.
185  *
186  * Returns: %TRUE if the string is valid.
187  */
188 gboolean captive_validate_AnsiString(const ANSI_STRING *string_AnsiString)
189 {
190         g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
191         g_return_val_if_fail(sizeof(CHAR)==sizeof(*string_AnsiString->Buffer),FALSE);
192         g_return_val_if_fail(string_AnsiString!=NULL,FALSE);
193         g_return_val_if_fail(string_AnsiString->MaximumLength>=string_AnsiString->Length+1,FALSE);
194         g_return_val_if_fail(string_AnsiString->Length==strlen(string_AnsiString->Buffer),FALSE);
195
196         return TRUE;
197 }
198
199
200 /* detect required memory size for g_alloca() */
201 size_t _captive_UnicodeString_to_utf8_alloca_internal_sizeof(const UNICODE_STRING *string_UnicodeString)
202 {
203 glong length;
204 size_t r;
205 const WCHAR *cwcharp;
206
207         g_return_val_if_fail(captive_validate_UnicodeString(string_UnicodeString),1);
208
209         /* measure 'string_UnicodeString->Buffer' length in UTF-8 to 'r' */
210         cwcharp=string_UnicodeString->Buffer;
211         r=0;
212         for (length=string_UnicodeString->Length/sizeof(*string_UnicodeString->Buffer);length;length--) {
213 gint utf8len;
214
215                 utf8len=g_unichar_to_utf8(
216                                 *cwcharp++,     /* c */
217                                 NULL);  /* outbuf=NULL => just the length will be computed */
218                 g_assert(utf8len>=0);
219                 r+=utf8len;
220                 }
221         g_assert(*cwcharp==0);
222         r++;    /* '\0'-termination */
223
224         /* utf8 byte-size */
225         return r;
226 }
227
228 /* transfer 'string_UnicodeString' to memory in 'mem' as utf8 w/o any further allocations */
229 void _captive_UnicodeString_to_utf8_alloca_internal_fill(gchar *mem,const UNICODE_STRING *string_UnicodeString)
230 {
231 const WCHAR *cwcharp;
232 #ifndef G_DISABLE_ASSERT
233 gchar *mem_orig=mem;
234 #endif /* G_DISABLE_ASSERT */
235
236         g_return_if_fail(mem!=NULL);
237         if (!captive_validate_UnicodeString(string_UnicodeString)) {
238                 *mem='\0';
239                 g_return_if_reached();
240                 }
241
242         /* We can't use any glib string conversions as UNICODE_STRING uses ucs2! */
243         /* We can't use any glib string conversions as we need to write the string
244          * to our supplied memory storage but glib always g_malloc()s it
245          */
246         /* copy 'string_UnicodeString->Buffer' to 'mem' */
247         for (cwcharp=string_UnicodeString->Buffer;*cwcharp;cwcharp++) {
248 gint utf8len;
249
250                 utf8len=g_unichar_to_utf8(
251                                 (gunichar)*cwcharp,     /* c */
252                                 mem);   /* outbuf */
253                 g_assert(utf8len>=0);
254                 mem+=utf8len;
255                 }
256         *mem='\0';
257
258         g_assert((size_t)((mem+1)-mem_orig) == _captive_UnicodeString_to_utf8_alloca_internal_sizeof(string_UnicodeString));
259         g_assert(captive_validate_utf8(mem_orig));
260 }
261
262
263 /**
264  * captive_UnicodeString_to_utf8_malloc:
265  * @string_UnicodeString: #PUNICODE_STRING type of string to convert.
266  *
267  * g_malloc()-based conversion from #PUNICODE_STRING to plain #utf8 string.
268  * You must free the result with g_free() function.
269  *
270  * Returns: #const #gchar * g_malloc()ed converted string @string_UnicodeString.
271  */
272 gchar *captive_UnicodeString_to_utf8_malloc(const UNICODE_STRING *string_UnicodeString)
273 {
274 gchar *r;
275 #ifndef FUNCMALLOC_FROM_ALLOCA
276 glong utf16_read,utf8_written;
277 GError *err;
278 #endif /* !FUNCMALLOC_FROM_ALLOCA */
279
280         g_return_val_if_fail(captive_validate_UnicodeString(string_UnicodeString),g_strdup(""));
281
282 #ifdef FUNCMALLOC_FROM_ALLOCA
283
284         r=g_malloc(_captive_UnicodeString_to_utf8_alloca_internal_sizeof(string_UnicodeString));
285         _captive_UnicodeString_to_utf8_alloca_internal_fill(r,string_UnicodeString);
286
287 #else
288
289         err=NULL;       /* not precleared by g_utf8_to_utf16()! */
290         r=g_utf16_to_utf8(
291                         (const gunichar2 *)string_UnicodeString->Buffer,        /* str */
292                         -1,     /* len=>'\0'-terminated */
293                         &utf16_read,    /* items_read; counted in unichar2 (NOT UTF-16 characters or bytes!) */
294                         &utf8_written,  /* items_written; counted in bytes (NOT UTF-8 characters!) */
295                         &err);
296         if (err) {
297                 g_warning("%s: utf16_read=%ld,utf8_written=%ld: %s",G_STRLOC,
298                                 (long)utf16_read,(long)utf8_written,err->message);
299                 g_error_free(err);
300                 g_assert(r==NULL);
301                 g_return_val_if_reached(g_strdup(""));
302                 }
303         g_assert(r!=NULL);
304
305         g_assert(utf16_read==(glong)(string_UnicodeString->length/sizeof(*string_UnicodeString->Buffer)));
306         g_assert(utf6_written==strlen(r));
307
308 #endif /* !FUNCMALLOC_FROM_ALLOCA */
309
310         g_assert(captive_validate_utf8(r));
311
312         return r;
313 }
314
315
316 /* detect required memory size for g_alloca() */
317 size_t _captive_utf8_to_UnicodeString_alloca_internal_sizeof(const gchar *string_utf8)
318 {
319         g_return_val_if_fail(captive_validate_utf8(string_utf8),1);
320
321         /* find the value for PUNICODE_STRING->MaximumLength */
322         return 0
323                         +sizeof(UNICODE_STRING)
324                         +sizeof(WCHAR)*(g_utf8_strlen(string_utf8,
325                                         -1      /* max; -1 means '\0'-terminated */
326                                         )+1);   /* '\0'-termination */
327 }
328
329 static void terminate_static_UnicodeString(UNICODE_STRING *string_UnicodeString,glong length)
330 {
331         /* 'string_UnicodeString' is not yet valid in this point! */
332         g_return_if_fail(string_UnicodeString!=NULL);
333         g_return_if_fail(length>=0);
334
335         string_UnicodeString->Length=length*sizeof(WCHAR);
336         string_UnicodeString->MaximumLength=(length+1)*sizeof(WCHAR);
337         string_UnicodeString->Buffer[length]=0;
338
339         g_assert(captive_validate_UnicodeString(string_UnicodeString));
340 }
341
342 /* transfer 'string_UnicodeString' to memory in 'mem' w/o any further allocations */
343 void _captive_utf8_to_UnicodeString_alloca_internal_fill(UNICODE_STRING *mem,const gchar *string_utf8)
344 {
345 gunichar2 *utf16;
346 captive_ucs2 *ucs2;
347 glong utf8_read,utf16_written;
348 GError *err;
349
350         g_return_if_fail(mem!=NULL);
351         mem->Buffer=(PWSTR)(((char *)mem)+sizeof(*mem));        /* for terminate_static_UnicodeString() below */
352         if (!captive_validate_utf8(string_utf8)) {
353                 terminate_static_UnicodeString(mem,0);
354                 g_return_if_reached();
355                 }
356
357         err=NULL;       /* not precleared by g_utf8_to_utf16()! */
358         utf16=g_utf8_to_utf16(
359                         string_utf8,    /* str */
360                         -1,     /* len=>'\0'-terminated */
361                         &utf8_read,     /* items_read; counted in bytes (NOT chars!) */
362                         &utf16_written, /* items_written; counted in UTF-16 characters (NOT unichar2 or bytes!) */
363                         &err);
364         if (err) {
365                 g_warning("%s: utf8_read=%ld,utf16_written=%ld: %s",G_STRLOC,
366                                 (long)utf8_read,(long)utf16_written,err->message);
367                 g_error_free(err);
368                 g_assert(utf16==NULL);
369                 terminate_static_UnicodeString(mem,0);
370                 g_return_if_reached();
371                 }
372         g_assert(utf16!=NULL);
373
374         /* Check for UCS-2 compliance (reject if surrogates inside) */
375         g_assert(captive_validate_ucs2((const captive_ucs2 *)utf16));
376         /* valid UCS-2 */
377         ucs2=(captive_ucs2 *)utf16;
378
379         g_assert(utf8_read==(glong)strlen(string_utf8));
380         g_assert(utf16_written==captive_ucs2_strlen(ucs2));
381
382         /* check of validity of _captive_utf8_to_UnicodeString_alloca_internal_sizeof() result */
383         g_assert((gchar *)(mem->Buffer+(utf16_written+1))       /* +1 => '\0'-termination */
384                         == ((gchar *)mem)+_captive_utf8_to_UnicodeString_alloca_internal_sizeof(string_utf8));
385
386         memcpy(mem->Buffer,ucs2,sizeof(WCHAR)*(utf16_written+1));
387         g_free(ucs2);
388         terminate_static_UnicodeString(mem,utf16_written);
389
390         g_assert(captive_validate_UnicodeString(mem));
391 }
392
393
/**
 * captive_utf8_to_UnicodeString_malloc:
 * @string_utf8: #const #gchar * string in #utf8 to convert.
 *
 * g_malloc()-based conversion from plain #utf8 string to #PUNICODE_STRING.
 * The header and its character buffer live in one g_malloc()ed block.
 * You must free the result with g_free() function.
 *
 * Returns: #PUNICODE_STRING g_malloc()ed converted string @string_utf8;
 * the conversion of an empty string if @string_utf8 is invalid.
 */
PUNICODE_STRING captive_utf8_to_UnicodeString_malloc(const gchar *string_utf8)
{
UNICODE_STRING *r;
#ifndef FUNCMALLOC_FROM_ALLOCA
/* locals reserved for the not yet implemented direct conversion */
gunichar *ucs4;
glong utf8_read,ucs4_written;
GError *err;
#endif /* !FUNCMALLOC_FROM_ALLOCA */

        /* invalid input falls back to converting "" by a nested call of ourselves */
        g_return_val_if_fail(captive_validate_utf8(string_utf8),captive_utf8_to_UnicodeString_malloc(""));

#ifdef FUNCMALLOC_FROM_ALLOCA

        /* reuse the alloca helpers: measure first, then fill the exactly sized block */
        r=g_malloc(_captive_utf8_to_UnicodeString_alloca_internal_sizeof(string_utf8));
        _captive_utf8_to_UnicodeString_alloca_internal_fill(r,string_utf8);

#else

#error "FIXME: NOT IMPLEMENTED"

#endif /* !FUNCMALLOC_FROM_ALLOCA */

        g_assert(captive_validate_UnicodeString(r));

        return r;
}
429
430
431 /* map: (const gunichar *) -> (const gunichar2 *); UCS-4 -> UTF-16 */
432 static GHashTable *captive_ucs4_to_utf16_hash;
433
434 static void captive_ucs4_to_utf16_hash_init(void)
435 {
436         if (captive_ucs4_to_utf16_hash)
437                 return;
438         captive_ucs4_to_utf16_hash=g_hash_table_new_full(
439                         g_direct_hash,  /* hash_func */
440                         g_direct_equal, /* key_equal_func */
441                         (GDestroyNotify)NULL,   /* key_destroy_func; we require persistent strings as input */
442                         (GDestroyNotify)g_free);        /* value_destroy_func; result of g_ucs4_to_utf16() */
443 }
444
/**
 * captive_ucs4_to_utf16_const:
 * @string_ucs4: #const #gunichar * type of persistent string to convert.
 * This string MUST remain readable with the same content forever.
 *
 * Constant string conversion from 32-bit #wchar_t to 16-bit (possible pairs of) UTF-16.
 * You may not modify the result in any way.
 * The result is cached by the address of @string_ucs4: repeated calls with
 * the same address return the same result pointer.
 * 
 * It is guaranteed to get two different string addresses for two different
 * input addresses even if the input strings content is the same.
 * Otherwise we would behave as #GCC option %-fmerge-constants which
 * results in %C non-conforming behaviour.
 *
 * FIXME: UTF-16 encoding IS NOT IMPLEMENTED.
 *
 * See also captive_ucs4_to_ucs2_const().
 *
 * Returns: #const #gunichar2 * converted string @string_ucs4;
 * the conversion of an empty string if @string_ucs4 is invalid or fails to convert.
 */
const gunichar2 *captive_ucs4_to_utf16_const(const gunichar *string_ucs4)
{
glong ucs4_read,utf16_written;
GError *err;
const gunichar2 *r_lookup;
gunichar2 *r;

        /* invalid input falls back to converting L"" by a nested call of ourselves;
         * NOTE(review): the cast treats wchar_t as being as wide as gunichar - confirm for the target platform
         */
        g_return_val_if_fail(captive_validate_ucs4(string_ucs4),captive_ucs4_to_utf16_const((const gunichar *)L""));

        captive_ucs4_to_utf16_hash_init();

        /* found already existing item in the table */
        if ((r_lookup=g_hash_table_lookup(captive_ucs4_to_utf16_hash,
                        string_ucs4)    /* key */
                        )) {
                return r_lookup;
                }

        /* Prepare 'r' as UTF-16 */
        err=NULL;       /* not precleared by g_ucs4_to_utf16()! */
        r=g_ucs4_to_utf16(
                        (const gunichar *)string_ucs4,  /* str */
                        -1,     /* len; -1 means '\0'-termination */
                        &ucs4_read,     /* items_read; counted in chars (==unichars; NOT bytes!) */
                        &utf16_written, /* items_written; counted in gunichar2 (NOT chars or bytes!) */
                        &err);
        if (err) {
                g_warning("%s: ucs4_read=%ld,utf16_written=%ld: %s",G_STRLOC,
                                (long)ucs4_read,(long)utf16_written,err->message);
                g_error_free(err);
                g_assert(r==NULL);
                g_return_val_if_reached(captive_ucs4_to_utf16_const((const gunichar *)L""));
                }
        g_assert(r!=NULL);
        /* NOTE(review): wcslen() on the cast pointer again assumes wchar_t is as wide as gunichar */
        g_assert(ucs4_read==(glong)wcslen((const wchar_t *)string_ucs4));
        /* FIXME: We don't have captive_utf16_strlen();
         * captive_ucs2_strlen() validates as pure UCS-2 and returns 0 if that fails.
         */
        g_assert(utf16_written==(glong)captive_ucs2_strlen((const gunichar2 *)r));
        /* (ucs4_read==utf16_written) check would discard any double-pair UTF-16 encodings
         * but this function is designed as UTF-16 compliant.
         */

        /* store new item to the table; the table takes over 'r' and will g_free() it */
        g_hash_table_insert(captive_ucs4_to_utf16_hash,
                        (gpointer)string_ucs4,  /* key; de-const */
                        r);     /* value */

#if 0   /* We don't have captive_validate_utf16() */
        g_assert(captive_validate_utf16(r));
#endif

        return r;
}