libntfs/unistr.c

   1 /*
   2  * unistr.c - Unicode string handling. Part of the Linux-NTFS project.
   3  *
   4  * Copyright (c) 2000-2002 Anton Altaparmakov
   5  *
   6  * This program/include file is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License as published
   8  * by the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program/include file is distributed in the hope that it will be
  12  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  13  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program (in the main directory of the Linux-NTFS
  18  * distribution in the file COPYING); if not, write to the Free Software
  19  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  */
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <wchar.h>
  25 #include <string.h>
  26 #include <errno.h>
  27
  28 #include "types.h"
  29 #include "unistr.h"
  30 #include "debug.h"
  31
  32 /*
  33  * IMPORTANT
  34  * =========
  35  *
  36  * All these routines assume that the Unicode characters are in little endian
  37  * encoding inside the strings!!!
  38  */
  39
  40 /*
  41  * This is used by the name collation functions to quickly determine what
  42  * characters are (in)valid.
  43  */
  44 const u8 legal_ansi_char_array[0x40] = {
  45         0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  46         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  47
  48         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  49         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
  50
  51         0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
  52         0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
  53
  54         0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
  55         0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
  56 };
  57
  58 /**
  59  * ntfs_names_are_equal - compare two Unicode names for equality
  60  * @s1:                 name to compare to @s2
  61  * @s1_len:             length in Unicode characters of @s1
  62  * @s2:                 name to compare to @s1
  63  * @s2_len:             length in Unicode characters of @s2
  64  * @ic:                 ignore case bool
  65  * @upcase:             upcase table (only if @ic == IGNORE_CASE)
  66  * @upcase_size:        length in Unicode characters of @upcase (if present)
  67  *
  68  * Compare the names @s1 and @s2 and return TRUE (1) if the names are
  69  * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
  70  * the @upcase table is used to performa a case insensitive comparison.
  71  */
  72 BOOL ntfs_names_are_equal(const uchar_t *s1, size_t s1_len,
  73                 const uchar_t *s2, size_t s2_len,
  74                 const IGNORE_CASE_BOOL ic,
  75                 const uchar_t *upcase, const u32 upcase_size)
  76 {
  77         if (s1_len != s2_len)
  78                 return FALSE;
  79         if (!s1_len)
  80                 return TRUE;
  81         if (ic == CASE_SENSITIVE)
  82                 return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
  83         return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
  84                                                                        TRUE;
  85 }
  86
  87 /**
  88  * ntfs_names_collate - collate two Unicode names
  89  * @upcase:     upcase table (ignored if @ic is CASE_SENSITIVE)
  90  * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE)
  91  * @name1:      first Unicode name to compare
  92  * @name2:      second Unicode name to compare
  93  * @ic:         either CASE_SENSITIVE or IGNORE_CASE
  94  * @err_val:    if @name1 contains an invalid character return this value
  95  *
  96  * ntfs_names_collate() collates two Unicode names and returns:
  97  *
  98  *  -1 if the first name collates before the second one,
  99  *   0 if the names match,
 100  *   1 if the second name collates before the first one, or
 101  * @err_val if an invalid character is found in @name1 during the comparison.
 102  *
 103  * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
 104  */
 105 int ntfs_names_collate(const uchar_t *name1, const u32 name1_len,
 106                 const uchar_t *name2, const u32 name2_len,
 107                 const int err_val, const IGNORE_CASE_BOOL ic,
 108                 const uchar_t *upcase, const u32 upcase_len)
 109 {
 110         u32 cnt;
 111         uchar_t c1, c2;
 112
 113 #ifdef DEBUG
 114         if (!name1 || !name2 || (ic && !upcase && upcase_len)) {
 115                 Dputs("ntfs_names_collate received NULL pointer!");
 116                 exit(1);
 117         }
 118 #endif
 119         for (cnt = 0; cnt < min(name1_len, name2_len); ++cnt)
 120         {
 121                 c1 = le16_to_cpu(*name1++);
 122                 c2 = le16_to_cpu(*name2++);
 123                 if (ic) {
 124                         if (c1 < upcase_len)
 125                                 c1 = le16_to_cpu(upcase[c1]);
 126                         if (c2 < upcase_len)
 127                                 c2 = le16_to_cpu(upcase[c2]);
 128                 }
 129                 if (c1 < 64 && legal_ansi_char_array[c1] & 8)
 130                         return err_val;
 131                 if (c1 < c2)
 132                         return -1;
 133                 if (c1 > c2)
 134                         return 1;
 135         }
 136         if (name1_len < name2_len)
 137                 return -1;
 138         if (name1_len == name2_len)
 139                 return 0;
 140         /* name1_len > name2_len */
 141         c1 = le16_to_cpu(*name1);
 142         if (c1 < 64 && legal_ansi_char_array[c1] & 8)
 143                 return err_val;
 144         return 1;
 145 }
 146
 147 /**
 148  * ntfs_ucsncmp - compare two little endian Unicode strings
 149  * @s1:         first string
 150  * @s2:         second string
 151  * @n:          maximum unicode characters to compare
 152  *
 153  * Compare the first @n characters of the Unicode strings @s1 and @s2,
 154  * The strings in little endian format and appropriate le16_to_cpu()
 155  * conversion is performed on non-little endian machines.
 156  *
 157  * The function returns an integer less than, equal to, or greater than zero
 158  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
 159  * to be less than, to match, or be greater than @s2.
 160  */
 161 int ntfs_ucsncmp(const uchar_t *s1, const uchar_t *s2, size_t n)
 162 {
 163         uchar_t c1, c2;
 164         size_t i;
 165
 166 #ifdef DEBUG
 167         if (!s1 || !s2) {
 168                 Dputs("ntfs_wcsncmp() received NULL pointer!");
 169                 exit(1);
 170         }
 171 #endif
 172         for (i = 0; i < n; ++i) {
 173                 c1 = le16_to_cpu(s1[i]);
 174                 c2 = le16_to_cpu(s2[i]);
 175                 if (c1 < c2)
 176                         return -1;
 177                 if (c1 > c2)
 178                         return 1;
 179                 if (!c1)
 180                         break;
 181         }
 182         return 0;
 183 }
 184
 185 /**
 186  * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
 187  * @s1:                 first string
 188  * @s2:                 second string
 189  * @n:                  maximum unicode characters to compare
 190  * @upcase:             upcase table
 191  * @upcase_size:        upcase table size in Unicode characters
 192  *
 193  * Compare the first @n characters of the Unicode strings @s1 and @s2,
 194  * ignoring case. The strings in little endian format and appropriate
 195  * le16_to_cpu() conversion is performed on non-little endian machines.
 196  *
 197  * Each character is uppercased using the @upcase table before the comparison.
 198  *
 199  * The function returns an integer less than, equal to, or greater than zero
 200  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
 201  * to be less than, to match, or be greater than @s2.
 202  */
 203 int ntfs_ucsncasecmp(const uchar_t *s1, const uchar_t *s2, size_t n,
 204                 const uchar_t *upcase, const u32 upcase_size)
 205 {
 206         uchar_t c1, c2;
 207         size_t i;
 208
 209 #ifdef DEBUG
 210         if (!s1 || !s2 || !upcase) {
 211                 Dputs("ntfs_wcsncasecmp() received NULL pointer!");
 212                 exit(1);
 213         }
 214 #endif
 215         for (i = 0; i < n; ++i) {
 216                 if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
 217                         c1 = le16_to_cpu(upcase[c1]);
 218                 if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
 219                         c2 = le16_to_cpu(upcase[c2]);
 220                 if (c1 < c2)
 221                         return -1;
 222                 if (c1 > c2)
 223                         return 1;
 224                 if (!c1)
 225                         break;
 226         }
 227         return 0;
 228 }
 229
 230 /**
 231  * ntfs_ucsnlen - determine the length of a little endian Unicode string
 232  * @s:          pointer to Unicode string
 233  * @maxlen:     maximum length of string @s
 234  *
 235  * Return the number of Unicode characters in the little endian Unicode
 236  * string @s up to a maximum of maxlen Unicode characters, not including
 237  * the terminating (uchar_t)'\0'. If there is no (uchar_t)'\0' between @s
 238  * and @s + @maxlen, @maxlen is returned.
 239  *
 240  * This function never looks beyond @s + @maxlen.
 241  */
 242 u32 ntfs_ucsnlen(const uchar_t *s, u32 maxlen)
 243 {
 244         u32 i;
 245
 246         for (i = 0; i < maxlen; i++) {
 247                 if (!le16_to_cpu(s[i]))
 248                         break;
 249         }
 250         return i;
 251 }
 252
 253 /**
 254  * ntfs_name_upcase
 255  */
 256 void ntfs_name_upcase(uchar_t *name, u32 name_len, const uchar_t *upcase,
 257                 const u32 upcase_len)
 258 {
 259         u32 i;
 260         uchar_t u;
 261
 262         for (i = 0; i < name_len; i++)
 263                 if ((u = le16_to_cpu(name[i])) < upcase_len)
 264                         name[i] = upcase[u];
 265 }
 266
 267 /**
 268  * ntfs_file_value_upcase
 269  */
 270 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
 271                 const uchar_t *upcase, const u32 upcase_len)
 272 {
 273         ntfs_name_upcase((uchar_t*)&file_name_attr->file_name,
 274                         file_name_attr->file_name_length, upcase, upcase_len);
 275 }
 276
 277 /**
 278  * ntfs_file_values_compare
 279  */
 280 int ntfs_file_values_compare(FILE_NAME_ATTR *file_name_attr1,
 281                 FILE_NAME_ATTR *file_name_attr2,
 282                 const int err_val, const IGNORE_CASE_BOOL ic,
 283                 const uchar_t *upcase, const u32 upcase_len)
 284 {
 285         return ntfs_names_collate((uchar_t*)&file_name_attr1->file_name,
 286                         file_name_attr1->file_name_length,
 287                         (uchar_t*)&file_name_attr2->file_name,
 288                         file_name_attr2->file_name_length,
 289                         err_val, ic, upcase, upcase_len);
 290 }
 291
 292 /**
 293  * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
 294  * @ins:        input Unicode string buffer
 295  * @ins_len:    length of input string in Unicode characters
 296  * @outs:       on return contains the (allocated) output multibyte string
 297  * @outs_len:   length of output buffer in bytes
 298  *
 299  * Convert the input little endian, 2-byte Unicode string @ins, of length
 300  * @ins_len into the multibyte string format dictated by the current locale.
 301  *
 302  * If *@outs is NULL, the function allocates the string and the caller is
 303  * responsible for calling free(*@outs); when finished with it.
 304  *
 305  * On success the function returns the number of bytes written to the output
 306  * string *@outs (>= 0), not counting the terminating NULL byte. If the output
 307  * string buffer was allocated, *@outs is set to it.
 308  *
 309  * On error, -1 is returned, and errno is set to the error code. The following
 310  * error codes can be expected:
 311  *      EINVAL          Invalid arguments (e.g. @ins or @outs is NULL).
 312  *      EILSEQ          The input string cannot be represented as a multibyte
 313  *                      sequence according to the current locale.
 314  *      ENAMETOOLONG    Destination buffer is too small for input string.
 315  *      ENOMEM          Not enough memory to allocate destination buffer.
 316  */
 317 int ntfs_ucstombs(const uchar_t *ins, const int ins_len, char **outs,
 318                 int outs_len)
 319 {
 320         char *mbs;
 321         wchar_t wc;
 322         int i, o, mbs_len;
 323         int cnt = 0;
 324         mbstate_t mbstate;
 325
 326         if (!ins || !outs) {
 327                 errno = EINVAL;
 328                 return -1;
 329         }
 330         mbs = *outs;
 331         mbs_len = outs_len;
 332         if (mbs && !mbs_len) {
 333                 errno = ENAMETOOLONG;
 334                 return -1;
 335         }
 336         if (!mbs) {
 337                 mbs_len = (ins_len + 1) * MB_CUR_MAX;
 338                 mbs = (char*)malloc(mbs_len);
 339                 if (!mbs)
 340                         return -1;
 341         }
 342         memset(&mbstate, 0, sizeof(mbstate));
 343         for (i = o = 0; i < ins_len; i++) {
 344                 /* Reallocate memory if necessary or abort. */
 345                 if (o + MB_CUR_MAX > mbs_len) {
 346                         char *tc;
 347                         if (mbs == *outs) {
 348                                 errno = ENAMETOOLONG;
 349                                 return -1;
 350                         }
 351                         tc = (char*)malloc((mbs_len + 64) & ~63);
 352                         if (!tc)
 353                                 goto err_out;
 354                         memcpy(tc, mbs, mbs_len);
 355                         mbs_len = (mbs_len + 64) & ~63;
 356                         free(mbs);
 357                         mbs = tc;
 358                 }
 359                 /* Convert the LE Unicode character to a CPU wide character. */
 360                 wc = (wchar_t)le16_to_cpu(ins[i]);
 361                 if (!wc)
 362                         break;
 363                 /* Convert the CPU endian wide character to multibyte. */
 364                 cnt = wcrtomb(mbs + o, wc, &mbstate);
 365                 if (cnt == -1)
 366                         goto err_out;
 367                 if (cnt <= 0) {
 368                         Dprintf("Eeek. cnt <= 0, cnt = %i\n", cnt);
 369                         errno = EINVAL;
 370                         goto err_out;
 371                 }
 372                 o += cnt;
 373         }
 374         /* Make sure we are back in the initial state. */
 375         if (!mbsinit(&mbstate)) {
 376                 Dputs("Eeek. mbstate not in initial state!");
 377                 errno = EILSEQ;
 378                 goto err_out;
 379         }
 380         /* Now write the NULL character. */
 381         mbs[o] = '\0';
 382         if (*outs != mbs)
 383                 *outs = mbs;
 384         return o;
 385 err_out:
 386         if (mbs != *outs) {
 387                 int eo = errno;
 388                 free(mbs);
 389                 errno = eo;
 390         }
 391         return -1;
 392 }
 393
 394 /**
 395  * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
 396  * @ins:        input multibyte string buffer
 397  * @outs:       on return contains the (allocated) output Unicode string
 398  * @outs_len:   length of output buffer in Unicode characters
 399  *
 400  * Convert the input multibyte string @ins, from the current locale into the
 401  * corresponding little endian, 2-byte Unicode string.
 402  *
 403  * If *@outs is NULL, the function allocates the string and the caller is
 404  * responsible for calling free(*@outs); when finished with it.
 405  *
 406  * On success the function returns the number of Unicode characters written to
 407  * the output string *@outs (>= 0), not counting the terminating Unicode NULL
 408  * character. If the output string buffer was allocated, *@outs is set to it.
 409  *
 410  * On error, -1 is returned, and errno is set to the error code. The following
 411  * error codes can be expected:
 412  *      EINVAL          Invalid arguments (e.g. @ins or @outs is NULL).
 413  *      EILSEQ          The input string cannot be represented as a Unicode
 414  *                      string according to the current locale.
 415  *      ENAMETOOLONG    Destination buffer is too small for input string.
 416  *      ENOMEM          Not enough memory to allocate destination buffer.
 417  */
 418 int ntfs_mbstoucs(char *ins, uchar_t **outs, int outs_len)
 419 {
 420         uchar_t *ucs;
 421         char *s;
 422         wchar_t wc;
 423         int i, o, cnt, ins_len, ucs_len;
 424         mbstate_t mbstate;
 425
 426         if (!ins || !outs) {
 427                 errno = EINVAL;
 428                 return -1;
 429         }
 430         ucs = *outs;
 431         ucs_len = outs_len;
 432         if (ucs && !ucs_len) {
 433                 errno = ENAMETOOLONG;
 434                 return -1;
 435         }
 436         /* Determine the length of the multi-byte string. */
 437         s = ins;
 438         memset(&mbstate, 0, sizeof(mbstate));
 439         ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
 440         if (ins_len == -1)
 441                 return ins_len;
 442         if ((s != ins) || !mbsinit(&mbstate)) {
 443                 errno = EILSEQ;
 444                 return -1;
 445         }
 446         /* Add the NULL terminator. */
 447         ins_len++;
 448         if (!ucs) {
 449                 ucs_len = ins_len;
 450                 ucs = (uchar_t*)malloc(ucs_len * sizeof(uchar_t));
 451                 if (!ucs)
 452                         return -1;
 453         }
 454         memset(&mbstate, 0, sizeof(mbstate));
 455         for (i = o = cnt = 0; o < ins_len; i += cnt, o++) {
 456                 /* Reallocate memory if necessary or abort. */
 457                 if (o >= ucs_len) {
 458                         uchar_t *tc;
 459                         if (ucs == *outs) {
 460                                 errno = ENAMETOOLONG;
 461                                 return -1;
 462                         }
 463                         /*
 464                          * We will never get here but hey, it's only a bit of
 465                          * extra code...
 466                          */
 467                         ucs_len = (ucs_len * sizeof(uchar_t) + 64) & ~63;
 468                         tc = (uchar_t*)realloc(ucs, ucs_len);
 469                         if (!tc)
 470                                 goto err_out;
 471                         ucs = tc;
 472                         ucs_len /= sizeof(uchar_t);
 473                 }
 474                 /* Convert the multibyte character to a wide character. */
 475                 cnt = mbrtowc(&wc, ins + i, ins_len - i, &mbstate);
 476                 if (!cnt)
 477                         break;
 478                 if (cnt == -1)
 479                         goto err_out;
 480                 if (cnt < -1) {
 481                         Dprintf("%s(): Eeek. cnt = %i\n", __FUNCTION__, cnt);
 482                         errno = EINVAL;
 483                         goto err_out;
 484                 }
 485                 /* Make sure we are not overflowing the NTFS Unicode set. */
 486                 if ((unsigned long)wc >= (unsigned long)(1 <<
 487                                 (8 * sizeof(uchar_t)))) {
 488                         errno = EILSEQ;
 489                         goto err_out;
 490                 }
 491                 /* Convert the CPU wide character to a LE Unicode character. */
 492                 ucs[o] = cpu_to_le16(wc);
 493         }
 494         /* Make sure we are back in the initial state. */
 495         if (!mbsinit(&mbstate)) {
 496                 Dprintf("%s(): Eeek. mbstate not in initial state!\n",
 497                                 __FUNCTION__);
 498                 errno = EILSEQ;
 499                 goto err_out;
 500         }
 501         /* Now write the NULL character. */
 502         ucs[o] = cpu_to_le16(L'\0');
 503         if (*outs != ucs)
 504                 *outs = ucs;
 505         return o;
 506 err_out:
 507         if (ucs != *outs) {
 508                 int eo = errno;
 509                 free(ucs);
 510                 errno = eo;
 511         }
 512         return -1;
 513 }
 514