1 /* -*- Mode: C; indent-tabs-mode: t; c-basic-offset: 8; tab-width: 8 -*- */
4 * gnome-vfs-mime-magic.c
7 * James Youngman (jay@gnu.org)
9 * Adapted to the GNOME needs by:
10 * Elliot Lee (sopwith@cuc.edu)
13 * Pavel Cisler <pavel@eazel.com>
17 #include "gnome-vfs-mime-magic.h"
19 #include <sys/types.h>
21 #include "gnome-vfs-mime-sniff-buffer-private.h"
22 #include "gnome-vfs-mime.h"
23 #include "gnome-vfs-private-utils.h"
29 #include <sys/types.h>
38 #include <glib/garray.h>
39 #include <glib/gmessages.h>
40 #include <glib/gstrfuncs.h>
41 #include <glib/gthread.h>
42 #include <glib/gutils.h>
43 #include <glib/gunicode.h>
/* Reports whether ch is an ASCII octal digit, i.e. lies in the range '0'..'7'. */
45 is_octal_digit (char ch)
47 return ch >= '0' && ch <= '7';
/* Reports whether ch is an ASCII hexadecimal digit. The ranges '0'..'9' and
 * 'a'..'f' are tested first; the final test covers the upper-case 'A'..'F'.
 */
51 is_hex_digit (char ch)
53 if (ch >= '0' && ch <= '9') {
56 if (ch >= 'a' && ch <= 'f') {
60 return (ch >= 'A' && ch <= 'F');
63 /* FIXME bugzilla.eazel.com 2760:
64 * should return error here
67 read_octal_byte (const char **pos)
72 for (count = 0; count < 3; count++) {
73 if (!is_octal_digit (**pos)) {
74 g_error ("bad octal digit %c", **pos);
79 retval += **pos - '0';
86 /* FIXME bugzilla.eazel.com 2760:
87 * should return error here
90 read_hex_byte (const char **pos)
95 for (count = 0; ; count++) {
96 if (!is_hex_digit (**pos)) {
97 g_error ("bad hex digit %c", **pos);
100 if (**pos >= '0' && **pos <= '9') {
101 retval += **pos - '0';
103 retval += g_ascii_tolower (**pos) - 'a' + 10;
116 /* FIXME bugzilla.eazel.com 2760:
117 * should return error here
120 read_string_val (const char *scanner, char *intobuf, int max_len, guint16 *into_len)
125 intobufend = intobuf + max_len - 1;
128 while (*scanner && !g_ascii_isspace (*scanner) && *scanner != '#') {
137 ch = read_hex_byte (&scanner);
143 /* read octal value */
144 ch = read_octal_byte (&scanner);
151 /* everything else is a literal */
159 /* already setup c/moved scanner */
161 if (intobuf < intobufend) {
/* Decodes a hexadecimal byte pattern found at scanner into result.
 *
 * scanner: text to parse; a leading '0' is tested for and the character that
 *          follows is compared against 'x'.
 * result:  destination buffer, written one byte per loop iteration.
 * length:  number of bytes to decode.
 *
 * Each byte needs two hex digits at the scanner position and is decoded with
 * read_hex_byte (), which receives the address of the local scanner.
 * NOTE(review): the return statements are not visible in this excerpt; the
 * caller assigns the result back to its own scanner -- confirm what is
 * returned when a digit pair is malformed.
 */
172 read_hex_pattern (const char *scanner, char *result, int length)
174 if (*scanner == '0') {
177 if (*scanner++ != 'x') {
180 for (;length > 0; length--) {
181 if (!is_hex_digit (scanner[0]) || !is_hex_digit (scanner[1])) {
184 *result++ = read_hex_byte (&scanner);
/* Parses a numeric literal located at *offset and stores it through result.
 *
 * offset: in/out cursor into the line being parsed; the final loop runs while
 *         *offset points at a character that is neither NUL nor ASCII white
 *         space.
 * bsize:  byte size of the value being read. NOTE(review): presumably it
 *         selects the sscanf length modifier; the lines using it are not
 *         visible here -- confirm.
 * result: receives the converted number via sscanf ().
 *
 * A leading '0' is examined first, and a following 'x' or 'X' is detected
 * with g_ascii_tolower (). Three separate sscanf () calls are visible, each
 * tested for having converted fewer than one item.
 * NOTE(review): the return values are not visible; the caller tests the
 * result with '!' -- confirm the success/failure convention.
 */
191 read_num_val(const char **offset, int bsize, int *result)
193 char fmttype, fmtstr[4];
194 const char *scanner = *offset;
196 if (*scanner == '0') {
197 if (g_ascii_tolower (scanner[1]) == 'x') {
211 if (sscanf (scanner, fmtstr, result) < 1) {
220 if (sscanf (scanner, fmtstr, result) < 1) {
228 if (sscanf (scanner, fmtstr, result) < 1) {
234 while (**offset && !g_ascii_isspace (**offset)) {
/* Skips leading white space: loops while g_ascii_isspace () accepts the
 * character at scanner.
 * NOTE(review): the loop body and return statement are not visible here;
 * callers assign the result back to their scanner -- confirm it is the
 * advanced pointer.
 */
242 eat_white_space (const char *scanner)
244 while (g_ascii_isspace (*scanner)) {
/* Checks whether the text at scanner begins with pattern.
 *
 * scanner:           text to examine.
 * resulting_scanner: out parameter; set to the position just past pattern
 *                    when the prefix matches, and to scanner itself by the
 *                    other visible assignment.
 * pattern:           NUL-terminated keyword to look for.
 *
 * The comparison is strncmp () limited to strlen (pattern) characters, so
 * only a prefix has to match.
 * NOTE(review): the return statements are not visible; callers use the
 * result as an if condition -- presumably TRUE on a match.
 */
251 match_pattern (const char *scanner, const char **resulting_scanner, const char *pattern)
253 if (strncmp(scanner, pattern, strlen (pattern)) == 0) {
254 *resulting_scanner = scanner + strlen (pattern);
257 *resulting_scanner = scanner;
/* Parses a magic definition file into an array of GnomeMagicEntry records.
 *
 * filename: path of the file to read; it is opened with fopen () for reading.
 * nents:    NOTE(review): not used on any line visible here; the callers in
 *           this file pass NULL -- confirm its purpose.
 *
 * The file is read with fgets () into a 256 byte buffer, one line at a time.
 * For each line the parser skips leading white space, tests for an empty
 * remainder or a '#' and for a leading digit, then reads in order:
 *   - an offset (range_start), optionally followed by ':' and a range end;
 *   - a type keyword (byte, short, long, string, date and the be and le
 *     variants), matched by prefix with match_pattern ();
 *   - the pattern, through read_string_val () for T_STR entries and
 *     read_num_val () otherwise;
 *   - an optional mask introduced by '&', read with read_hex_pattern () and
 *     applied to the stored pattern immediately;
 *   - the mime type, copied with g_snprintf () and stripped of trailing
 *     white space.
 * Entries are collected in a GArray; retval is taken from array->data and the
 * array is released with g_array_free (array, FALSE), which keeps the data.
 * NOTE(review): a second g_array_append_val () call follows the per-line one;
 * presumably it appends the T_END terminator that readers of the table loop
 * on -- confirm.
 */
262 _gnome_vfs_mime_magic_parse (const gchar *filename, gint *nents)
265 GnomeMagicEntry newent, *retval;
267 const char *infile_name;
269 char parsed_line [256];
273 infile_name = filename;
279 infile = fopen (infile_name, "r");
284 array = g_array_new (FALSE, FALSE, sizeof (GnomeMagicEntry));
286 while (fgets (parsed_line, sizeof (parsed_line), infile)) {
287 scanner = parsed_line;
290 scanner = eat_white_space (scanner);
292 if (!*scanner || *scanner == '#') {
296 if (!g_ascii_isdigit (*scanner)) {
300 if (sscanf (scanner, "%hu", &newent.range_start) < 1) {
303 newent.range_end = newent.range_start;
305 while (g_ascii_isdigit (*scanner)) {
306 scanner++; /* eat the offset */
309 if (*scanner == ':') {
310 /* handle an offset range */
312 if (sscanf (scanner, "%hu", &newent.range_end) < 1) {
317 while (*scanner && !g_ascii_isspace (*scanner)) {
318 scanner++; /* eat the offset */
321 scanner = eat_white_space (scanner);
323 if (!*scanner || *scanner == '#') {
327 if (match_pattern (scanner, &scanner, "byte")) {
328 newent.type = T_BYTE;
329 } else if (match_pattern (scanner, &scanner, "short")) {
330 newent.type = T_SHORT;
331 } else if (match_pattern (scanner, &scanner, "long")) {
332 newent.type = T_LONG;
333 } else if (match_pattern (scanner, &scanner, "string")) {
335 } else if (match_pattern (scanner, &scanner, "date")) {
336 newent.type = T_DATE;
337 } else if (match_pattern (scanner, &scanner, "beshort")) {
338 newent.type = T_BESHORT;
339 } else if (match_pattern (scanner, &scanner, "belong")) {
340 newent.type = T_BELONG;
341 } else if (match_pattern (scanner, &scanner, "bedate")) {
342 newent.type = T_BEDATE;
343 } else if (match_pattern (scanner, &scanner, "leshort")) {
344 newent.type = T_LESHORT;
345 } else if (match_pattern (scanner, &scanner, "lelong")) {
346 newent.type = T_LELONG;
347 } else if (match_pattern (scanner, &scanner, "ledate")) {
348 newent.type = T_LEDATE;
350 continue; /* weird type */
352 scanner = eat_white_space (scanner);
353 if (!*scanner || *scanner == '#') {
357 switch (newent.type) {
385 if (newent.type == T_STR) {
386 scanner = read_string_val (scanner, newent.pattern,
387 sizeof (newent.pattern), &newent.pattern_length);
389 newent.pattern_length = bsize;
390 if (!read_num_val (&scanner, bsize, (int *)&newent.pattern)) {
395 scanner = eat_white_space (scanner);
396 if (!*scanner || *scanner == '#') {
400 if (*scanner == '&') {
402 scanner = read_hex_pattern (scanner, &newent.mask [0], newent.pattern_length);
404 g_error ("bad mask");
407 newent.use_mask = TRUE;
409 for (index = 0; index < newent.pattern_length; index++) {
410 /* Apply the mask to the pattern itself so we don't have to
411 * do it each time we compare it with the tested bytes.
413 newent.pattern[index] &= newent.mask[index];
416 newent.use_mask = FALSE;
419 scanner = eat_white_space (scanner);
420 if (!*scanner || *scanner == '#') {
424 g_snprintf (newent.mimetype, sizeof (newent.mimetype), "%s", scanner);
425 bsize = strlen (newent.mimetype) - 1;
426 while (newent.mimetype [bsize] && g_ascii_isspace (newent.mimetype [bsize])) {
427 newent.mimetype [bsize--] = '\0';
430 g_array_append_val (array, newent);
435 g_array_append_val (array, newent);
437 retval = (GnomeMagicEntry *)array->data;
442 g_array_free (array, FALSE);
448 endian_swap (guchar *result, const guchar *data, gsize length)
450 const guchar *source_ptr = data;
451 guchar *dest_ptr = result + length - 1;
452 while (dest_ptr >= result) {
453 *dest_ptr-- = *source_ptr++;
457 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
458 #define FIRST_ENDIAN_DEPENDENT_TYPE T_BESHORT
459 #define LAST_ENDIAN_DEPENDENT_TYPE T_BEDATE
461 #define FIRST_ENDIAN_DEPENDENT_TYPE T_LESHORT
462 #define LAST_ENDIAN_DEPENDENT_TYPE T_LEDATE
466 try_one_pattern_on_buffer (const char *sniffed_stream, GnomeMagicEntry *magic_entry)
468 gboolean using_cloned_pattern;
469 char pattern_clone [48];
473 using_cloned_pattern = FALSE;
474 if (magic_entry->type >= FIRST_ENDIAN_DEPENDENT_TYPE && magic_entry->type <= LAST_ENDIAN_DEPENDENT_TYPE) {
475 /* Endian-convert the data we are trying to recognize to
476 * our host endianness.
478 char swap_buffer [sizeof(magic_entry->pattern)];
480 g_assert(magic_entry->pattern_length <= 4);
482 memcpy (swap_buffer, sniffed_stream, magic_entry->pattern_length);
484 endian_swap (pattern_clone, swap_buffer, magic_entry->pattern_length);
485 sniffed_stream = &pattern_clone[0];
486 using_cloned_pattern = TRUE;
489 if (magic_entry->use_mask) {
490 /* Apply mask to the examined data. At this point the data in
491 * sniffed_stream is in the same endianness as the mask.
494 if (!using_cloned_pattern) {
495 memcpy (pattern_clone, sniffed_stream, magic_entry->pattern_length);
496 using_cloned_pattern = TRUE;
497 sniffed_stream = &pattern_clone[0];
500 for (index = 0; index < magic_entry->pattern_length; index++) {
501 pattern_clone[index] &= magic_entry->mask[index];
505 if (*magic_entry->pattern != *sniffed_stream) {
509 for (count = magic_entry->pattern_length, pattern = magic_entry->pattern;
510 count > 0; count--) {
511 if (*pattern++ != *sniffed_stream++) {
519 SNIFF_BUFFER_CHUNK = 32
524 gnome_vfs_mime_try_one_magic_pattern (GnomeVFSMimeSniffBuffer *sniff_buffer,
525 GnomeMagicEntry *magic_entry)
529 if (sniff_buffer->read_whole_file &&
530 sniff_buffer->buffer_length < magic_entry->range_end + magic_entry->pattern_length) {
531 /* There's no place this pattern could actually match */
535 for (offset = magic_entry->range_start; offset <= magic_entry->range_end; offset++) {
536 /* this check is done only as an optimization
537 * _gnome_vfs_mime_sniff_buffer_get already implements the laziness.
538 * This gets called a million times though and every bit performance
539 * is valuable. This way we avoid making the call.
542 if (sniff_buffer->buffer_length < offset + magic_entry->pattern_length) {
544 if (!sniff_buffer->read_whole_file) {
545 if (_gnome_vfs_mime_sniff_buffer_get (sniff_buffer,
546 offset + magic_entry->pattern_length) != GNOME_VFS_OK) {
550 /* We have the entire file and the pattern won't fit. Return FALSE */
555 if (try_one_pattern_on_buffer (sniff_buffer->buffer + offset, magic_entry)) {
562 /* We lock this mutex whenever we modify global state in this module. */
563 G_LOCK_DEFINE_STATIC (mime_magic_table_mutex);
/* Cached magic table for this module; stays NULL until a table is parsed. */
565 static GnomeMagicEntry *mime_magic_table = NULL;
/* Returns the module-wide magic table. While holding mime_magic_table_mutex,
 * the table is parsed from the gnome-vfs-mime-magic file under SYSCONFDIR if
 * the cached pointer is still NULL; the pointer is then returned after the
 * lock has been released.
 */
567 static GnomeMagicEntry *
568 gnome_vfs_mime_get_magic_table (void)
570 G_LOCK (mime_magic_table_mutex);
572 if (mime_magic_table == NULL) {
573 mime_magic_table = _gnome_vfs_mime_magic_parse
574 (SYSCONFDIR "/gnome-vfs-mime-magic" , NULL);
577 G_UNLOCK (mime_magic_table_mutex);
579 return mime_magic_table;
/* Finds the mime type whose magic pattern matches buffer.
 *
 * The magic table is fetched with gnome_vfs_mime_get_magic_table () and
 * walked until an entry of type T_END; the mimetype field of the first entry
 * accepted by gnome_vfs_mime_try_one_magic_pattern () is returned, so the
 * result refers to storage inside the table entry.
 * NOTE(review): the values returned for a NULL table or when nothing matches
 * are not visible here -- presumably NULL; confirm.
 */
583 _gnome_vfs_mime_get_type_from_magic_table (GnomeVFSMimeSniffBuffer *buffer)
585 GnomeMagicEntry *magic_table;
587 magic_table = gnome_vfs_mime_get_magic_table ();
588 if (magic_table == NULL) {
592 for (; magic_table->type != T_END; magic_table++) {
593 if (gnome_vfs_mime_try_one_magic_pattern (buffer, magic_table)) {
594 return magic_table->mimetype;
/* Returns the cached magic table, parsing it from table_path when no table
 * has been loaded yet. The check and the parse happen with
 * mime_magic_table_mutex held. Because the file is only parsed while the
 * cached pointer is NULL, an already loaded table is returned unchanged
 * regardless of table_path.
 */
602 gnome_vfs_mime_test_get_magic_table (const char *table_path)
604 G_LOCK (mime_magic_table_mutex);
605 if (mime_magic_table == NULL) {
606 mime_magic_table = _gnome_vfs_mime_magic_parse (table_path, NULL);
608 G_UNLOCK (mime_magic_table_mutex);
610 return mime_magic_table;
613 #define HEX_DIGITS "0123456789abcdef"
/* Writes length bytes of string to stdout in escaped form: a backslash or
 * '#' is preceded by a backslash, any other character accepted by
 * g_ascii_isgraph () is printed as is, and the remaining bytes are written
 * as a backslash, an 'x' and two lower-case hex digits from HEX_DIGITS.
 */
616 print_escaped_string (const guchar *string, int length)
618 for (; length > 0; length--, string++) {
619 if (*string == '\\' || *string == '#') {
620 /* escape \, #, etc. properly */
621 printf ("\\%c", *string);
622 } else if (g_ascii_isgraph (*string)) {
623 /* everything printable except for white space can go directly */
624 printf ("%c", *string);
626 /* everything else goes in hex */
627 printf ("\\x%c%c", HEX_DIGITS[(*string) / 16], HEX_DIGITS[(*string) % 16]);
/* Writes each of the length bytes of string to stdout as two lower-case hex
 * digits taken from HEX_DIGITS, high nibble first.
 */
633 print_hex_pattern (const guchar *string, int length)
636 for (; length > 0; length--, string++) {
637 printf ("%c%c", HEX_DIGITS[(*string) / 16], HEX_DIGITS[(*string) % 16]);
/* Prints every entry of the magic table to stdout.
 *
 * The table comes from gnome_vfs_mime_get_magic_table () and is walked until
 * an entry of type T_END. Each entry starts with range_start, followed by ':'
 * and range_end when the two differ. After the type (handled by the switch,
 * whose cases are not visible here) come the escaped pattern, the mask in hex
 * when use_mask is set, and finally a tab, the mime type and a newline.
 */
641 gnome_vfs_mime_dump_magic_table (void)
643 GnomeMagicEntry *magic_table;
645 magic_table = gnome_vfs_mime_get_magic_table ();
646 if (magic_table == NULL) {
650 for (; magic_table->type != T_END; magic_table++) {
651 printf ("%d", magic_table->range_start);
652 if (magic_table->range_start != magic_table->range_end) {
653 printf (":%d", magic_table->range_end);
656 switch (magic_table->type) {
694 print_escaped_string (magic_table->pattern, magic_table->pattern_length);
695 if (magic_table->use_mask) {
697 print_hex_pattern (magic_table->mask, magic_table->pattern_length);
699 printf ("\t%s\n", magic_table->mimetype);
/* Drops the cached magic table: with mime_magic_table_mutex held, the table
 * is released with g_free () and the cached pointer is reset to NULL, so the
 * next accessor that finds it NULL parses a table again.
 */
704 _gnome_vfs_mime_clear_magic_table (void)
706 G_LOCK (mime_magic_table_mutex);
707 g_free (mime_magic_table);
708 mime_magic_table = NULL;
709 G_UNLOCK (mime_magic_table_mutex);
/* Implementation note: this is a thin wrapper that forwards buffer to
 * _gnome_vfs_get_mime_type_internal () with NULL as the second argument.
 */
713 * gnome_vfs_get_mime_type_for_buffer:
714 * @buffer: a sniff buffer referencing either a file or data in memory
716 * This routine uses a magic database to guess the mime type of the
717 * data represented by @buffer.
719 * Returns a pointer to an internal copy of the mime-type for @buffer.
722 gnome_vfs_get_mime_type_for_buffer (GnomeVFSMimeSniffBuffer *buffer)
724 return _gnome_vfs_get_mime_type_internal (buffer, NULL);
728 GNOME_VFS_TEXT_SNIFF_LENGTH = 256
/* Heuristic text detection on the first GNOME_VFS_TEXT_SNIFF_LENGTH bytes.
 *
 * Steps visible below:
 *   1. request the bytes with _gnome_vfs_mime_sniff_buffer_get () (its result
 *      is not checked) and test for an empty buffer;
 *   2. validate the buffer as UTF-8 with g_utf8_validate (); if that fails,
 *      g_utf8_get_char_validated () is applied to the remaining bytes and a
 *      result of -2 identifies a character cut off by the end of the buffer;
 *   3. when HAVE_WCTYPE_H and HAVE_MBRTOWC are defined, decode the buffer with
 *      mbrtowc () and check every wide character with iswspace () and
 *      iswprint (), treating an incomplete trailing character as acceptable.
 * NOTE(review): the return statements are not visible in this excerpt --
 * confirm the value returned for an empty buffer.
 */
733 * _gnome_vfs_sniff_buffer_looks_like_text:
734 * @sniff_buffer: buffer to examine
736 * Return value: returns %TRUE if the contents of @sniff_buffer appear to
740 _gnome_vfs_sniff_buffer_looks_like_text (GnomeVFSMimeSniffBuffer *sniff_buffer)
744 _gnome_vfs_mime_sniff_buffer_get (sniff_buffer, GNOME_VFS_TEXT_SNIFF_LENGTH);
746 if (sniff_buffer->buffer_length == 0) {
750 if (g_utf8_validate (sniff_buffer->buffer,
751 sniff_buffer->buffer_length, (const gchar**)&end))
755 /* Check whether the string was truncated in the middle of
756 * a valid UTF8 char, or if we really have an invalid
759 gint remaining_bytes = sniff_buffer->buffer_length;
761 remaining_bytes -= (end-((gchar*)sniff_buffer->buffer));
763 if (g_utf8_get_char_validated(end, remaining_bytes) == -2)
765 #if defined(HAVE_WCTYPE_H) && defined (HAVE_MBRTOWC)
772 src = sniff_buffer->buffer;
773 end = sniff_buffer->buffer + sniff_buffer->buffer_length;
775 memset (&state, 0, sizeof (state));
777 /* Don't allow embedded zeros in textfiles */
781 wlen = mbrtowc(&wc, src, end - src, &state);
783 if (wlen == (size_t)(-1)) {
784 /* Illegal mb sequence */
788 if (wlen == (size_t)(-2)) {
789 /* No complete mb char before end
790 * Probably a cut off char which is ok */
795 /* Don't allow embedded zeros in textfiles */
799 if (!iswspace (wc) && !iswprint(wc)) {
800 /* Not a printable or whitespace
801 * Probably not a text file */
809 #endif /* defined(HAVE_WCTYPE_H) && defined (HAVE_MBRTOWC) */
/* Bitrate lookup table, indexed as bitrates[ver - 1][br] by
 * get_mp3_frame_length ().
 * NOTE(review): the values are presumably in kbit/s -- confirm.
 */
814 static int bitrates[2][15] = {
815 { 0, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320},
816 { 0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160 }
/* Sampling frequency lookup table, indexed as frequencies[ver - 1][srf] by
 * get_mp3_frame_length ().
 * NOTE(review): the values are presumably in Hz -- confirm.
 */
819 static int frequencies[2][3] = {
820 { 44100, 48000, 32000 },
821 { 22050, 24000, 16000 }
825 * Return length of an MP3 frame using potential 32-bit header value. See
826 * "http://www.dv.co.yu/mpgscript/mpeghdr.htm" for details on the header
829 * NOTE: As an optimization and because they are rare, this returns 0 for
830 * version 2.5 or free format MP3s.
833 get_mp3_frame_length (unsigned long mp3_header)
/* ver: 4 minus the two-bit field at bits 19-20; only 1 and 2 are accepted below. */
835 int ver = 4 - ((mp3_header >> 19) & 3u);
/* br: four-bit bitrate index taken from bits 12-15. */
836 int br = (mp3_header >> 12) & 0xfu;
/* srf: two-bit sampling rate index taken from bits 10-11. */
837 int srf = (mp3_header >> 10) & 3u;
839 /* are frame sync and layer 3 bits set? */
840 if (((mp3_header & 0xffe20000ul) == 0xffe20000ul)
842 && ((ver == 1) || (ver == 2))
843 /* good bitrate index (not free or invalid)? */
844 && (br > 0) && (br < 15)
845 /* good sampling rate frequency index? */
847 /* not using reserved emphasis value? */
848 && ((mp3_header & 3u) != 2)) {
849 /* then this is most likely the beginning of a valid frame */
/* length = bitrate * 144000 / frequency, plus bit 9 of the header, minus 4. */
851 gsize length = (gsize) bitrates[ver - 1][br] * 144000;
852 length /= frequencies[ver - 1][srf];
853 return length += ((mp3_header >> 9) & 1u) - 4;
/* Produces an unsigned long from the data at bytes using a loop of four
 * passes that starts from a value of 0.
 * NOTE(review): the loop body is not visible in this excerpt, so the byte
 * order cannot be confirmed here; presumably the four bytes are combined
 * most significant first, since the caller passes the result to
 * get_mp3_frame_length () as a 32-bit header -- confirm.
 */
859 get_4_byte_value (const unsigned char *bytes)
861 unsigned long value = 0;
864 for (count = 0; count < 4; ++count) {
872 GNOME_VFS_MP3_SNIFF_LENGTH = 256
876 * _gnome_vfs_sniff_buffer_looks_like_mp3:
877 * @sniff_buffer: buffer to examine
879 * Return value: returns %TRUE if the contents of @sniff_buffer appear to
883 _gnome_vfs_sniff_buffer_looks_like_mp3 (GnomeVFSMimeSniffBuffer *sniff_buffer)
885 unsigned long mp3_header;
888 if (_gnome_vfs_mime_sniff_buffer_get (sniff_buffer, GNOME_VFS_MP3_SNIFF_LENGTH) != GNOME_VFS_OK) {
893 * Use algorithm described in "ID3 tag version 2.3.0 Informal Standard"
894 * at "http://www.id3.org/id3v2.3.0.html" to detect a valid header, "An
895 * ID3v2 tag can be detected with the following pattern:
896 * $49 44 33 yy yy xx zz zz zz zz
897 * Where yy is less than $FF, xx is the 'flags' byte and zz is less than
900 * The informal standard also says, "The ID3v2 tag size is encoded with
901 * four bytes where the most significant bit (bit 7) is set to zero in
902 * every byte, making a total of 28 bits. The zeroed bits are ignored,
903 * so a 257 bytes long tag is represented as $00 00 02 01."
905 if (strncmp ((char *) sniff_buffer->buffer, "ID3", 3) == 0
906 && (sniff_buffer->buffer[3] != 0xffu)
907 && (sniff_buffer->buffer[4] != 0xffu)
908 && (sniff_buffer->buffer[6] < 0x80u)
909 && (sniff_buffer->buffer[7] < 0x80u)
910 && (sniff_buffer->buffer[8] < 0x80u)
911 && (sniff_buffer->buffer[9] < 0x80u)) {
912 /* checks for existence of vorbis identification header */
913 for (offset=10; offset < GNOME_VFS_MP3_SNIFF_LENGTH-7; offset++) {
914 if (strncmp ((char *) &sniff_buffer->buffer[offset],
915 "\x01vorbis", 7) == 0) {
923 * Scan through the first "GNOME_VFS_MP3_SNIFF_LENGTH" bytes of the
924 * buffer to find a potential 32-bit MP3 frame header.
927 for (offset = 0; offset < GNOME_VFS_MP3_SNIFF_LENGTH; offset++) {
931 mp3_header |= sniff_buffer->buffer[offset];
932 mp3_header &= 0xfffffffful;
934 length = get_mp3_frame_length (mp3_header);
938 * Since one frame is available, is there another frame
939 * just to be sure this is more likely to be a real MP3
942 offset += 1 + length;
944 if (_gnome_vfs_mime_sniff_buffer_get (sniff_buffer, offset + 4) != GNOME_VFS_OK) {
947 mp3_header = get_4_byte_value (&sniff_buffer->buffer[offset]);
948 length = get_mp3_frame_length (mp3_header);