/* $Id$ * Unicode add-ons to reactos ntoskrnl/rtl/unicode.c for libcaptive * Copyright (C) 2002 Jan Kratochvil * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; exactly version 2 of June 1991 is required * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "config.h" #include "captive/unicode.h" /* self */ #include "captive/unicode_reactos.h" /* for captive_ucs2 */ #include #include #include #include #include "reactos/napi/types.h" /* for PUNICODE_STRING etc. */ #include "reactos/unicode.h" #include "captive/macros.h" #include #include /* for wcslen() */ #include #include /* CONFIG: */ /* Use simplified g_malloc() functions as wrappers around g_alloca() ones. */ #define FUNCMALLOC_FROM_ALLOCA 1 /* compiler sanity: checks that gunichar is 4 bytes, WCHAR is 2 bytes and CHAR is 1 byte; returns TRUE only when all three sizes match, otherwise g_return_val_if_fail() makes it return FALSE. The validators in this file call it first. */ static gboolean captive_validate_unicode_types(void) { g_return_val_if_fail(4==sizeof(gunichar),FALSE); g_return_val_if_fail(2==sizeof(WCHAR),FALSE); g_return_val_if_fail(1==sizeof(CHAR),FALSE); return TRUE; } /** * captive_validate_ucs4: * @string_ucs4: #const #gunichar * type string to validate. * Invalid string input is forbidden. * * Checks the validity of all 32-bit unicharacters of 0-terminated string. * It is required to have characters complying to g_unichar_validate(). * * Returns: %TRUE if the string is valid. 
*/ gboolean captive_validate_ucs4(const gunichar *string_ucs4) { const gunichar *cs_ucs4; g_return_val_if_fail(captive_validate_unicode_types(),FALSE); g_return_val_if_fail(string_ucs4!=NULL,FALSE); for (cs_ucs4=string_ucs4;*cs_ucs4;cs_ucs4++) g_return_val_if_fail(g_unichar_validate(*cs_ucs4),FALSE); return TRUE; } /** * captive_validate_ucs2_fixlen: * @string_ucs2: #const #captive_ucs2 * type string to validate. * Invalid string input is forbidden. * UTF-16 encoded strings are forbidden. * @string_ucs2_fixlen: Number of characters from @string_ucs2 to check. * captive_ucs2_strlen(@string_ucs2)>=@string_ucs2_fixlen is required. * Negative value is forbidden. * * Checks the validity of the first @string_ucs2_fixlen 16-bit unicharacters of @string_ucs2. * It is required to have characters complying to g_unichar_validate(). * The string must contain at least @string_ucs2_fixlen characters. * * Returns: %TRUE if the string is valid. */ gboolean captive_validate_ucs2_fixlen(const captive_ucs2 *string_ucs2,glong string_ucs2_fixlen) { const captive_ucs2 *cs_ucs2; g_return_val_if_fail(captive_validate_unicode_types(),FALSE); g_return_val_if_fail(string_ucs2!=NULL,FALSE); g_return_val_if_fail(string_ucs2_fixlen>=0,FALSE); /* g_unichar_validate() will reject surrogates (G_UNICODE_SURROGATE). NOTE(review): the loop header below looks truncated -- the text after 'cs_ucs2' runs straight into the 'Buffer),FALSE);' tail of a check on 'string_UnicodeString', so the rest of this function and whatever definitions followed it appear to be missing here. Restore them from version control before building. */ for (cs_ucs2=string_ucs2;cs_ucs2Buffer),FALSE); g_return_val_if_fail(string_UnicodeString!=NULL,FALSE); g_return_val_if_fail(string_UnicodeString->Length%sizeof(*string_UnicodeString->Buffer)==0,FALSE); g_return_val_if_fail(string_UnicodeString->MaximumLength >=string_UnicodeString->Length+sizeof(*string_UnicodeString->Buffer),FALSE); g_return_val_if_fail(string_UnicodeString->Length==sizeof(*string_UnicodeString->Buffer)* captive_ucs2_strlen(string_UnicodeString->Buffer) ,FALSE); g_return_val_if_fail(captive_validate_ucs2(string_UnicodeString->Buffer),FALSE); return TRUE; } /** * captive_validate_UnicodeString_noterm: * @string_UnicodeString_noterm: #PUNICODE_STRING type string to 
validate.
 * Invalid string input is forbidden.
 *
 * Checks the internal consistency of the given @string_UnicodeString_noterm:
 * its Length must be a whole number of WCHARs, its MaximumLength must not be
 * smaller than its Length, no 0 character may occur inside the first Length
 * bytes and these characters must pass captive_validate_ucs2_fixlen().
 * @string_UnicodeString_noterm does not need to be zero-terminated.
 *
 * Returns: %TRUE if the string is valid.
 */
gboolean captive_validate_UnicodeString_noterm(const UNICODE_STRING *string_UnicodeString_noterm)
{
const WCHAR *cwp,*cwp_end;
glong length_chars;

	g_return_val_if_fail(captive_validate_unicode_types(),FALSE);
	/* sizeof() does not evaluate its operand so it is safe ahead of the NULL check */
	g_return_val_if_fail(sizeof(WCHAR)==sizeof(*string_UnicodeString_noterm->Buffer),FALSE);
	g_return_val_if_fail(string_UnicodeString_noterm!=NULL,FALSE);
	/* the scan below dereferences Buffer; captive_validate_ucs2_fixlen() rejects NULL anyway */
	g_return_val_if_fail(string_UnicodeString_noterm->Buffer!=NULL,FALSE);
	g_return_val_if_fail(string_UnicodeString_noterm->Length%sizeof(*string_UnicodeString_noterm->Buffer)==0,FALSE);
	g_return_val_if_fail(string_UnicodeString_noterm->MaximumLength>=string_UnicodeString_noterm->Length,FALSE);

	/* Length is counted in bytes; convert it to a count of WCHARs just once */
	length_chars=string_UnicodeString_noterm->Length/sizeof(*string_UnicodeString_noterm->Buffer);
	cwp_end=string_UnicodeString_noterm->Buffer+length_chars;

	/* embedded 0 characters are forbidden inside the counted part of the string */
	for (cwp=string_UnicodeString_noterm->Buffer;cwp<cwp_end;cwp++) {
		g_return_val_if_fail(*cwp!=0,FALSE);
		}
	g_return_val_if_fail(captive_validate_ucs2_fixlen(string_UnicodeString_noterm->Buffer,length_chars),FALSE);

	return TRUE;
}

/**
 * captive_validate_AnsiString:
 * @string_AnsiString: #PANSI_STRING type string to validate.
 * Invalid string input is forbidden.
 *
 * Checks the internal consistency of the given @string_AnsiString.
 *
 * Returns: %TRUE if the string is valid.
*/ gboolean captive_validate_AnsiString(const ANSI_STRING *string_AnsiString) { g_return_val_if_fail(captive_validate_unicode_types(),FALSE); g_return_val_if_fail(sizeof(CHAR)==sizeof(*string_AnsiString->Buffer),FALSE); g_return_val_if_fail(string_AnsiString!=NULL,FALSE); g_return_val_if_fail(string_AnsiString->MaximumLength>=string_AnsiString->Length+1,FALSE); g_return_val_if_fail(string_AnsiString->Length==strlen(string_AnsiString->Buffer),FALSE); return TRUE; } /** * captive_ucs2_compare: * @string_a_ucs2: First string of type #const #gunichar2 * in pure UCS-2. * Invalid string input is forbidden. UTF-16 encoded pairs are forbidden. * @string_b_ucs2: Second string of type #const #gunichar2 * in pure UCS-2. * Invalid string input is forbidden. UTF-16 encoded pairs are forbidden. * * Compares case-sensitively @string_a_ucs2 and @string_b_ucs2. * * Returns: %TRUE if @string_a_ucs2 and @string_b_ucs2 are the same. */ gboolean captive_ucs2_compare(const captive_ucs2 *string_a_ucs2,const captive_ucs2 *string_b_ucs2) { guint ui; g_return_val_if_fail(captive_validate_ucs2(string_a_ucs2),FALSE); g_return_val_if_fail(captive_validate_ucs2(string_b_ucs2),FALSE); ui=0; do { if (string_a_ucs2[ui]!=string_b_ucs2[ui]) return FALSE; } while (string_a_ucs2[ui++]); return TRUE; } /** * captive_UnicodeString_compare: * @string_a_UnicodeString: First string of type #PUNICODE_STRING. * Invalid string input is forbidden. * @string_b_UnicodeString: Second string of type #PUNICODE_STRING. * Invalid string input is forbidden. * * Compares case-sensitively @string_a_UnicodeString and @string_b_UnicodeString. * * Returns: %TRUE if @string_a_UnicodeString and @string_b_UnicodeString are the same. 
*/ gboolean captive_UnicodeString_compare (const UNICODE_STRING *string_a_UnicodeString,const UNICODE_STRING *string_b_UnicodeString) { g_return_val_if_fail(captive_validate_UnicodeString(string_a_UnicodeString),FALSE); g_return_val_if_fail(captive_validate_UnicodeString(string_b_UnicodeString),FALSE); if (string_a_UnicodeString->Length!=string_b_UnicodeString->Length) return FALSE; return captive_ucs2_compare(string_a_UnicodeString->Buffer,string_b_UnicodeString->Buffer); } /** * captive_ucs2_compare_insensitive: * @string_a_ucs2: First string of type #const #gunichar2 * in pure UCS-2. * Invalid string input is forbidden. UTF-16 encoded pairs are forbidden. * @string_b_ucs2: Second string of type #const #gunichar2 * in pure UCS-2. * Invalid string input is forbidden. UTF-16 encoded pairs are forbidden. * * Compares case-insensitively @string_a_ucs2 and @string_b_ucs2. * * Returns: %TRUE if @string_a_ucs2 and @string_b_ucs2 are the same. */ gboolean captive_ucs2_compare_insensitive(const captive_ucs2 *string_a_ucs2,const captive_ucs2 *string_b_ucs2) { guint ui; g_return_val_if_fail(captive_validate_ucs2(string_a_ucs2),FALSE); g_return_val_if_fail(captive_validate_ucs2(string_b_ucs2),FALSE); ui=0; do { if (g_unichar_toupper(string_a_ucs2[ui])!=g_unichar_toupper(string_b_ucs2[ui])) return FALSE; } while (string_a_ucs2[ui++]); return TRUE; } /** * captive_UnicodeString_compare_insensitive: * @string_a_UnicodeString: First string of type #PUNICODE_STRING. * Invalid string input is forbidden. * @string_b_UnicodeString: Second string of type #PUNICODE_STRING. * Invalid string input is forbidden. * * Compares case-insensitively @string_a_UnicodeString and @string_b_UnicodeString. * * Returns: %TRUE if @string_a_UnicodeString and @string_b_UnicodeString are the same. 
*/ gboolean captive_UnicodeString_compare_insensitive (const UNICODE_STRING *string_a_UnicodeString,const UNICODE_STRING *string_b_UnicodeString) { g_return_val_if_fail(captive_validate_UnicodeString(string_a_UnicodeString),FALSE); g_return_val_if_fail(captive_validate_UnicodeString(string_b_UnicodeString),FALSE); if (string_a_UnicodeString->Length!=string_b_UnicodeString->Length) return FALSE; return captive_ucs2_compare_insensitive(string_a_UnicodeString->Buffer,string_b_UnicodeString->Buffer); } /* detect required memory size for g_alloca() */ size_t _captive_UnicodeString_to_utf8_alloca_internal_sizeof(const UNICODE_STRING *string_UnicodeString) { glong length; size_t r; const WCHAR *cwcharp; g_return_val_if_fail(captive_validate_UnicodeString(string_UnicodeString),1); /* measure 'string_UnicodeString->Buffer' length in UTF-8 to 'r' */ cwcharp=string_UnicodeString->Buffer; r=0; for (length=string_UnicodeString->Length/sizeof(*string_UnicodeString->Buffer);length;length--) { gint utf8len; utf8len=g_unichar_to_utf8( *cwcharp++, /* c */ NULL); /* outbuf=NULL => just the length will be computed */ g_assert(utf8len>=0); r+=utf8len; } g_assert(*cwcharp==0); r++; /* '\0'-termination */ /* utf8 byte-size */ return r; } /* transfer 'string_UnicodeString' to memory in 'mem' as utf8 w/o any further allocations */ void _captive_UnicodeString_to_utf8_alloca_internal_fill(gchar *mem,const UNICODE_STRING *string_UnicodeString) { const WCHAR *cwcharp; #ifndef G_DISABLE_ASSERT gchar *mem_orig=mem; #endif /* G_DISABLE_ASSERT */ g_return_if_fail(mem!=NULL); if (!captive_validate_UnicodeString(string_UnicodeString)) { *mem='\0'; g_return_if_reached(); } /* We can't use any glib string conversions as UNICODE_STRING uses ucs2! 
*/ /* We can't use any glib string conversions as we need to write the string * to our supplied memory storage but glib always g_malloc()s it */ /* copy 'string_UnicodeString->Buffer' to 'mem' */ for (cwcharp=string_UnicodeString->Buffer;*cwcharp;cwcharp++) { gint utf8len; utf8len=g_unichar_to_utf8( (gunichar)*cwcharp, /* c */ mem); /* outbuf */ g_assert(utf8len>=0); mem+=utf8len; } *mem='\0'; g_assert((size_t)((mem+1)-mem_orig) == _captive_UnicodeString_to_utf8_alloca_internal_sizeof(string_UnicodeString)); g_assert(captive_validate_utf8(mem_orig)); } /** * captive_UnicodeString_to_utf8_malloc: * @string_UnicodeString: #PUNICODE_STRING type of string to convert. * * g_malloc()-based conversion from #PUNICODE_STRING to plain #utf8 string. * You must free the result with g_free() function. * * Returns: #const #gchar * g_malloc()ed converted string @string_UnicodeString. */ gchar *captive_UnicodeString_to_utf8_malloc(const UNICODE_STRING *string_UnicodeString) { gchar *r; #ifndef FUNCMALLOC_FROM_ALLOCA glong utf16_read,utf8_written; GError *err; #endif /* !FUNCMALLOC_FROM_ALLOCA */ g_return_val_if_fail(captive_validate_UnicodeString(string_UnicodeString),g_strdup("")); #ifdef FUNCMALLOC_FROM_ALLOCA r=g_malloc(_captive_UnicodeString_to_utf8_alloca_internal_sizeof(string_UnicodeString)); _captive_UnicodeString_to_utf8_alloca_internal_fill(r,string_UnicodeString); #else err=NULL; /* not precleared by g_utf8_to_utf16()! */ r=g_utf16_to_utf8( (const gunichar2 *)string_UnicodeString->Buffer, /* str */ -1, /* len=>'\0'-terminated */ &utf16_read, /* items_read; counted in unichar2 (NOT UTF-16 characters or bytes!) */ &utf8_written, /* items_written; counted in bytes (NOT UTF-8 characters!) 
*/ &err); if (err) { g_warning("%s: utf16_read=%ld,utf8_written=%ld: %s",G_STRLOC, (long)utf16_read,(long)utf8_written,err->message); g_error_free(err); g_assert(r==NULL); g_return_val_if_reached(g_strdup("")); } g_assert(r!=NULL); g_assert(utf16_read==(glong)(string_UnicodeString->length/sizeof(*string_UnicodeString->Buffer))); g_assert(utf6_written==strlen(r)); #endif /* !FUNCMALLOC_FROM_ALLOCA */ g_assert(captive_validate_utf8(r)); return r; } /* detect required memory size for g_alloca() */ size_t _captive_utf8_to_UnicodeString_alloca_internal_sizeof(const gchar *string_utf8) { g_return_val_if_fail(captive_validate_utf8(string_utf8),1); /* find the value for PUNICODE_STRING->MaximumLength */ return 0 +sizeof(UNICODE_STRING) +sizeof(WCHAR)*(g_utf8_strlen(string_utf8, -1 /* max; -1 means '\0'-terminated */ )+1); /* '\0'-termination */ } static void terminate_static_UnicodeString(UNICODE_STRING *string_UnicodeString,glong length) { /* 'string_UnicodeString' is not yet valid in this point! */ g_return_if_fail(string_UnicodeString!=NULL); g_return_if_fail(length>=0); string_UnicodeString->Length=length*sizeof(WCHAR); string_UnicodeString->MaximumLength=(length+1)*sizeof(WCHAR); string_UnicodeString->Buffer[length]=0; g_assert(captive_validate_UnicodeString(string_UnicodeString)); } /* transfer 'string_UnicodeString' to memory in 'mem' w/o any further allocations */ void _captive_utf8_to_UnicodeString_alloca_internal_fill(UNICODE_STRING *mem,const gchar *string_utf8) { gunichar2 *utf16; captive_ucs2 *ucs2; glong utf8_read,utf16_written; GError *err; g_return_if_fail(mem!=NULL); mem->Buffer=(PWSTR)(((char *)mem)+sizeof(*mem)); /* for terminate_static_UnicodeString() below */ if (!captive_validate_utf8(string_utf8)) { terminate_static_UnicodeString(mem,0); g_return_if_reached(); } err=NULL; /* not precleared by g_utf8_to_utf16()! */ utf16=g_utf8_to_utf16( string_utf8, /* str */ -1, /* len=>'\0'-terminated */ &utf8_read, /* items_read; counted in bytes (NOT chars!) 
*/ &utf16_written, /* items_written; counted in UTF-16 characters (NOT unichar2 or bytes!) */ &err); if (err) { g_warning("%s: utf8_read=%ld,utf16_written=%ld: %s",G_STRLOC, (long)utf8_read,(long)utf16_written,err->message); g_error_free(err); g_assert(utf16==NULL); terminate_static_UnicodeString(mem,0); g_return_if_reached(); } g_assert(utf16!=NULL); /* Check for UCS-2 compliance (reject if surrogates inside) */ g_assert(captive_validate_ucs2((const captive_ucs2 *)utf16)); /* valid UCS-2 */ ucs2=(captive_ucs2 *)utf16; g_assert(utf8_read==(glong)strlen(string_utf8)); g_assert(utf16_written==captive_ucs2_strlen(ucs2)); /* check of validity of _captive_utf8_to_UnicodeString_alloca_internal_sizeof() result */ g_assert((gchar *)(mem->Buffer+(utf16_written+1)) /* +1 => '\0'-termination */ == ((gchar *)mem)+_captive_utf8_to_UnicodeString_alloca_internal_sizeof(string_utf8)); memcpy(mem->Buffer,ucs2,sizeof(WCHAR)*(utf16_written+1)); g_free(ucs2); terminate_static_UnicodeString(mem,utf16_written); g_assert(captive_validate_UnicodeString(mem)); } /** * captive_utf8_to_UnicodeString_malloc: * @string_utf8: #const #gchar * string in #utf8 to convert. * * g_malloc()-based conversion from plain #utf8 string to #PUNICODE_STRING. * You must free the result with g_free() function. * * Returns: #PUNICODE_STRING g_malloc()ed converted string @string_utf8. 
*/ PUNICODE_STRING captive_utf8_to_UnicodeString_malloc(const gchar *string_utf8) { UNICODE_STRING *r; #ifndef FUNCMALLOC_FROM_ALLOCA gunichar *ucs4; glong utf8_read,ucs4_written; GError *err; #endif /* !FUNCMALLOC_FROM_ALLOCA */ g_return_val_if_fail(captive_validate_utf8(string_utf8),captive_utf8_to_UnicodeString_malloc("")); #ifdef FUNCMALLOC_FROM_ALLOCA r=g_malloc(_captive_utf8_to_UnicodeString_alloca_internal_sizeof(string_utf8)); _captive_utf8_to_UnicodeString_alloca_internal_fill(r,string_utf8); #else #error "FIXME: NOT IMPLEMENTED" #endif /* !FUNCMALLOC_FROM_ALLOCA */ g_assert(captive_validate_UnicodeString(r)); return r; } /* map: (const gunichar *) -> (const gunichar2 *); UCS-4 -> UTF-16 */ static GHashTable *captive_ucs4_to_utf16_hash; static void captive_ucs4_to_utf16_hash_init(void) { if (captive_ucs4_to_utf16_hash) return; captive_ucs4_to_utf16_hash=g_hash_table_new_full( g_direct_hash, /* hash_func */ g_direct_equal, /* key_equal_func */ (GDestroyNotify)NULL, /* key_destroy_func; we require persistent strings as input */ (GDestroyNotify)g_free); /* value_destroy_func; result of g_ucs4_to_utf16() */ } /** * captive_ucs4_to_utf16_const: * @string_ucs4: #const #gunichar * type of persistent string to convert. * This string MUST remain readable with the same content forever. * * Constant string conversion from 32-bit #wchar_t to 16-bit (possible pairs of) UTF-16. * You may not modify the result in any way. * * It is guaranteed to get two different string addresses for two different * input addresses even if the input strings content is the same. * Otherwise we would behave as #GCC option %-fmerge-constants which * results in %C non-conforming behaviour. * * FIXME: UTF-16 encoding IS NOT IMPLEMENTED. * * See also captive_ucs4_to_ucs2_const(). * * Returns: #const #gunichar2 * converted string @string_ucs4. 
*/
const gunichar2 *captive_ucs4_to_utf16_const(const gunichar *string_ucs4)
{
/* Persistent empty UCS-4 string converted instead of unusable input.
 * It is spelled with gunichar elements as a wide literal would have
 * wchar_t elements whose size need not match the one of gunichar.
 */
static const gunichar empty_ucs4[1]={ 0 };
glong ucs4_read,utf16_written;
#ifndef G_DISABLE_ASSERT
glong ucs4_length;	/* referenced only by the g_assert() check below */
#endif /* G_DISABLE_ASSERT */
GError *err;
const gunichar2 *r_lookup;
gunichar2 *r;

	g_return_val_if_fail(captive_validate_ucs4(string_ucs4),captive_ucs4_to_utf16_const(empty_ucs4));

	captive_ucs4_to_utf16_hash_init();
	/* found already existing item in the table */
	if ((r_lookup=g_hash_table_lookup(captive_ucs4_to_utf16_hash,
			string_ucs4)	/* key */
			)) {
		return r_lookup;
		}

	/* Prepare 'r' as UTF-16 */
	err=NULL;	/* not precleared by g_ucs4_to_utf16()! */
	r=g_ucs4_to_utf16(
			(const gunichar *)string_ucs4,	/* str */
			-1,	/* len; -1 means '\0'-termination */
			&ucs4_read,	/* items_read; counted in chars (==unichars; NOT bytes!) */
			&utf16_written,	/* items_written; counted in gunichar2 (NOT chars or bytes!) */
			&err);
	if (err) {
		g_warning("%s: ucs4_read=%ld,utf16_written=%ld: %s",G_STRLOC,
				(long)ucs4_read,(long)utf16_written,err->message);
		g_error_free(err);
		g_assert(r==NULL);
		g_return_val_if_reached(captive_ucs4_to_utf16_const(empty_ucs4));
		}
	g_assert(r!=NULL);
#ifndef G_DISABLE_ASSERT
	/* count the input in gunichar units; wcslen() would count in wchar_t units */
	for (ucs4_length=0;string_ucs4[ucs4_length];ucs4_length++)
		;
#endif /* G_DISABLE_ASSERT */
	g_assert(ucs4_read==ucs4_length);
	/* FIXME: We don't have captive_utf16_strlen() */
	g_assert(utf16_written==(glong)captive_ucs2_strlen((const gunichar2 *)r));
	/* (ucs4_read==utf16_written) check would discard any double-pair UTF-16 encodings
	 * but this function is designed as UTF-16 compliant.
	 */

	/* store new item to the table */
	g_hash_table_insert(captive_ucs4_to_utf16_hash,
			(gpointer)string_ucs4,	/* key; de-const */
			r);	/* value */

#if 0	/* We don't have captive_validate_utf16() */
	g_assert(captive_validate_utf16(r));
#endif

	return r;
}