Drop "MS-Windows XP " prefix from all the 'id's.
[captive.git] / src / install / acquire / captivemodid-print.pl
1 #! /usr/bin/perl
2 #
3 # $Id$
4 # Scan downloaded Microsoft files to build the .captivemodid.xml file core.
5 # Copyright (C) 2005 Jan Kratochvil <project-captive@jankratochvil.net>
6
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; exactly version 2 of June 1991 is required
10
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 # GNU General Public License for more details.
15
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
20
21 use strict;
22 use warnings;
23
24 use Cwd;
25 use Carp qw(confess cluck);
26 use Digest::MD5 qw(md5_hex);
27 use File::Remove qw(remove);
28 use Getopt::Long;
29 use Data::Dumper;
30
31
32 my $tmp="/tmp/captivemodid";
33 my $out;
34
35
36 my $opt_quiet;
37 die if !GetOptions(
38                 "q|quiet+",\$opt_quiet,
39                 );
40
41
42 our $dirnew=$ARGV[0];
43 do { $dirnew="."; warn "Defaulting ARGV[0] to: $dirnew"; } if !$dirnew;
44 our $out=$ARGV[1];
45 warn "No output md5 directory, not copying." if !$out;
46
47
48 my %type=(
49         "ntoskrnl.exe"=>"Kernel",
50         "ntkrnlpa.exe"=>"Kernel PA",
51         "ntkrnlmp.exe"=>"Kernel SMP",
52         "ntkrpamp.exe"=>"Kernel SMP PA",
53         "cdfs.sys"    =>"CD-ROM/iso-9660",
54         "fastfat.sys" =>"FastFAT/vfat",
55         "ntfs.sys"    =>"NTFS",
56         );
57
58 my %lang=(
59         "ar"=>"Arabic",
60         "hk"=>"Chinese (Hong Kong SAR)",
61         "cn"=>"Chinese (Simplified)",
62         "tw"=>"Chinese (Traditional)",
63         "cs"=>"Czech",
64         "da"=>"Danish",
65         "nl"=>"Dutch",
66         "en"=>"English",
67         "fi"=>"Finnish",
68         "fr"=>"French",
69         "de"=>"German",
70         "el"=>"Greek",
71         "he"=>"Hebrew",
72         "hu"=>"Hungarian",
73         "it"=>"Italian",
74         "ja"=>"Japanese",
75         "ko"=>"Korean",
76         "no"=>"Norwegian",
77         "pl"=>"Polish",
78         "br"=>"Portugese (Brazil)",
79         "pt"=>"Portugese (Portugal)",
80         "ru"=>"Russian",
81         "es"=>"Spanish",
82         "sv"=>"Swedish",
83         "tr"=>"Turkish",
84         "ara"=>"Arabic",
85         "chh"=>"Chinese (Hong Kong SAR)",
86         "chs"=>"Chinese (Simplified)",
87         "cht"=>"Chinese (Traditional)",
88         "csy"=>"Czech",
89         "dan"=>"Danish",
90         "nld"=>"Dutch",
91         "enu"=>"English",
92         "fin"=>"Finnish",
93         "fra"=>"French",
94         "deu"=>"German",
95         "ell"=>"Greek",
96         "heb"=>"Hebrew",
97         "hun"=>"Hungarian",
98         "ita"=>"Italian",
99         "jpn"=>"Japanese",
100         "kor"=>"Korean",
101         "nor"=>"Norwegian",
102         "plk"=>"Polish",
103         "ptb"=>"Portugese (Brazil)",
104         "ptg"=>"Portugese (Portugal)",
105         "rus"=>"Russian",
106         "esn"=>"Spanish",
107         "sve"=>"Swedish",
108         "trk"=>"Turkish",
109         );
110
111
112 sub name_to_lang($)
113 {
114 my($name)=@_;
115
116         $name=~tr/-/_/;
117         $name=lc $name;
118         my $r;
119         while (my($subst,$long)=each(%lang)) {
120                 next if $name!~/[^a-z]$subst[^a-z]/;
121 # FIXME
122                 cluck "$name: $subst" if $r;
123                 $r=$long;
124                 }
125 # FIXME
126         cluck $name if !$r;
127         return $r;
128 }
129
130 my @checked_build=qw(
131         chk
132         debug
133         );
134
135 sub name_is_debug($)
136 {
137 my($name)=@_;
138
139         $name=~tr/-/_/;
140         $name=lc $name;
141         for my $subst (@checked_build) {
142                 return 1 if $name=~/[^a-z]$subst[^a-z]/;
143                 }
144         return;
145 }
146
147 my $ver_prefix="5.1.2600.";
148 my @stack_out;
149 my $STDERR_needs_eol;
150 my %md5sum_printed_global;
151
152 sub stack0_flush()
153 {
154         return if $opt_quiet;
155         # FIXME: Proper nesting, not just these 2 levels:
156         my %md5sum_printed_local;
157         while (my $stack=shift @stack_out) {
158                 for my $stacki (0..$#$stack) {
159                         my $this=$stack->[$stacki];
160                         next if $md5sum_printed_local{$this->{"md5"}}++;
161                         my $dupe_global=$md5sum_printed_global{$this->{"md5"}}++;
162                         my $print="";
163                         $print.=<<"HERE";
164 <!-- @{[ $this->{"relname"} ]} -->
165 HERE
166                         $print.=<<"HERE" if !$dupe_global;
167 <module type="@{[ $this->{"type"} ]}" length="@{[ $this->{"length"} ]}" priority="@{[ $this->{"priority"} ]}" md5="@{[ $this->{"md5"} ]}"
168                 id="@{[ $this->{"dist"} ]} @{[ !$this->{"is_debug"} ? "" : "Checked Build "]}@{[ $this->{"lang"} ]} $ver_prefix@{[ $this->{"ver"} ]} @{[ $this->{"this_name"} ]}" />
169 HERE
170                         $print=~s/^/"\t" x (1+$stacki)/egm;
171                         print STDERR "\n" if $STDERR_needs_eol;
172                         $STDERR_needs_eol=0;
173                         print $print;
174                         }
175                 }
176         print "\n";
177 }
178
179 our @stack;
180 my %md5sum_stored;
181 my $last_stack0_filename;
182
183 sub check($$)
184 {
185 my($filename_unused,$basename_orig)=@_;
186
187         return if !$type{lc $basename_orig};
188         my $final_type=$basename_orig;
189         my $final_name=$type{$final_type} or confess;
190         $final_type="ntoskrnl.exe" if $final_type=~/^nt.*[.]exe$/;
191         (my $basename_orig_=$basename_orig)=~s/.$/_/;
192         (my $basename0=$stack[0]->{"filename"})=~s{^.*/}{} or confess;
193         stack0_flush() if $last_stack0_filename && $last_stack0_filename ne $stack[0]->{"filename"};
194         $last_stack0_filename=$stack[0]->{"filename"};
195         for my $stacki (0..$#stack) {
196                 my $this=$stack[$stacki];
197                 my $filename=$this->{"filename"} or confess;
198                 my $relname=$this->{"relname"} or confess;
199                 (my $basename=$filename)=~s{^.*/}{} or confess;
200                 my $expanded=$stack[$#stack]->{"filename"};
201                 my $this_name;
202                 my $type;
203                 if ($stacki==$#stack-1 && $basename eq $basename_orig_) {
204                         $this->{"type"} eq "cabinet" or confess;
205                         $type=$this->{"type"} or confess;
206                         $this_name="$final_name Cabinet";
207                         }
208                 elsif ($stacki<$#stack) {
209                         $type=$this->{"type"} or confess;
210                         $this_name="Cabinet";
211                         }
212                 else {
213                         $type=$final_type or confess;
214                         $this_name=$final_name;
215                         }
216                 my $length=(stat $filename)[7] or confess;
217                 my $dist=$stack[0]->{"filename"};
218                 $dist=~s{^.*/}{} or confess;
219                 my $lang=name_to_lang $basename0;
220                 local *F;
221                 open F,$expanded or confess;
222                 my $buf="";
223                 my $buf2;
224                 my $ver;
225                 while (my $got=read F,$buf2,0x1000) {
226                         confess $expanded if !defined $got;
227                         last if !$got;
228                         confess $expanded if $got<0;
229                         confess $expanded if length($buf2)!=$got;
230                         $buf.=$buf2;
231                         $ver=$buf;
232                         $ver=~s/\x00\x00+/!/g;
233                         $ver=~tr/\x00//d;
234                         $ver=~tr/!/\x00/;
235                         last if $ver=~s/^.*\x00\Q$ver_prefix\E(\d+)[ \x00].*$/$1/s;
236                         $ver=undef();
237                         $buf=substr($buf,-0x100);
238                         }
239                 confess $expanded if !$ver;
240                 close F or confess;
241                 $ver=~/^\d+$/ or confess "$expanded: $ver";
242                 my $pri=sprintf "510%04u00",$ver;
243                 $pri+=90;
244                 $pri-=10 if $lang ne "English";
245                 $pri-=20 if $basename_orig=~/^ntkrnlpa/;
246                 $pri-=40 if $basename_orig=~/^ntkrnlmp/;
247                 $pri-=60 if $basename_orig=~/^ntkrpamp/;
248                 $pri+=5000000 if my $is_debug=name_is_debug $basename0;
249                 local *F;
250                 open F,$filename or confess;
251                 my $md5obj=Digest::MD5->new();
252                 $md5obj->addfile(*F);
253                 close F or confess;
254                 my $md5sum=lc $md5obj->hexdigest();
255                 spawn("cp -p '$filename' '$out/$md5sum'") if !$md5sum_stored{$md5sum}++ && $out && $stacki==$#stack;
256
257                 my %new=(
258                         %$this,
259                         "type"=>$type,
260                         "length"=>$length,
261                         "priority"=>$pri,
262                         "md5"=>$md5sum,
263                         "dist"=>$dist,
264                         "lang"=>$lang,
265                         "ver"=>$ver,
266                         "this_name"=>$this_name,
267                         "is_debug"=>$is_debug,
268                         );
269                 # Highest priority/version of all the files in the cabinet:
270                 for (qw(ver priority)) {
271                         next if !$this->{$_};
272                         $new{$_}=$this->{$_} if $new{$_}<$this->{$_};
273                         delete $this->{$_} if $this->{$_}<$new{$_};
274                         }
275                 do { cluck Dumper($this,\%new) if $new{$_} ne $this->{$_}; } for keys(%$this);
276                 %$this=%new;
277                 }
278         push @stack_out,[@stack];
279 }
280
281 sub spawn($)
282 {
283 my($command)=@_;
284
285         my $code=system "($command)".' >&2';
286         confess "$code: $command" if $code;
287 }
288
289 sub rm_rf($)
290 {
291 my($dir)=@_;
292
293         return if !-e $dir;
294         # Do not: Can't remove directory xyzzy: Directory not empty
295         #         -r--r--r-- xyzzy
296         #         remove \1,$dir or confess "$dir: $!";
297         spawn "rm -rf '$dir'";
298 }
299
300 sub mkdir_checked
301 {
302 my(@dirs)=@_;
303
304         for (@dirs) {
305                 mkdir $_ or confess "$_: $!";
306                 }
307 }
308
309
310 our $depth=-1;
311
312 sub prep()
313 {
314         confess if $dirnew;
315         $dirnew="$tmp/$depth";
316         chdir "/" or confess;
317         rm_rf $dirnew;
318         mkdir_checked $dirnew;
319         chdir $dirnew or confess "chdir: $dirnew";
320         return $dirnew;
321 }
322
323 sub relname($)
324 {
325 my($filename)=@_;
326
327         return $filename if $filename=~s{^\Q$tmp\E/\d+/}{};
328         (my $basename=$filename)=~s{^.*/}{} or confess;
329         return $basename;
330 }
331
332 my %unknown;
333 sub process();
334 sub process()
335 {
336         my $dirname=$dirnew or confess;
337         $dirnew=undef();
338         local $depth=$depth+1;
339         my @stack_orig=@stack;
340
341         local *FIND;
342         open FIND,"find '$dirname' -type f|sort|" or confess $dirname;
343         local $_;
344         while (<FIND>) {
345                 next if $_ eq "";
346                 if (!$depth) {
347                         print STDERR ".";
348                         $STDERR_needs_eol=1;
349                         }
350                 chomp;
351                 my $filename=$_;
352                 $filename=~m{^/} or confess $filename;
353                 (my $basename=$filename)=~s{^.*/}{} or confess;
354                 -r $filename or do { cluck $filename; system("ls -l '$filename'"); };
355                 my $this={
356                         "filename"=>$filename,
357                         "relname"=>relname($filename),
358                         };
359                 @stack=(@stack_orig,$this);
360
361                 check $filename,$basename;
362
363                 local *MIME;
364                 open MIME,"file -b -i '$filename'|" or confess;
365                 my $mime=do { local $/; <MIME> or confess; };
366                 close MIME or confess;
367                 chomp $mime;
368                 next if $mime=~m{^text/};
369                 next if $mime=~m{^image/};
370                 next if $mime=~m{^audio/};
371                 next if $mime eq 'application/x-empty';
372                 next if $mime eq 'application/msaccess';
373
374                 local *ID;
375                 open ID,"file -b '$filename'|" or confess;
376                 my $id=do { local $/; <ID> or confess; };
377                 close ID or confess;
378                 chomp $id;
379                 next if $id eq 'Microsoft Office Document';
380                 next if $id=~/^PDF document, version 1[.]\d+$/;
381                 next if $id eq 'Rich Text Format data, version 1,';
382                 next if $id=~/^x86 boot sector, /;
383                 next if $id eq 'data';
384                 next if $id eq 'MZ executable for MS-DOS';
385                 next if $id eq 'XML document text';
386                 next if $id eq 'MS Windows HtmlHelp Data';
387                 next if $id eq 'MPEG sequence';
388                 next if $id=~/^MSVC program database ver /;
389                 next if $id eq 'Lotus 1-2-3';
390                 next if $id eq 'MS Windows Help Data';
391                 next if $id=~/^Macromedia Flash data, /;
392                 next if $id eq 'Microsoft ASF'; # .wmv
393                 next if $id eq 'GLF_BINARY_LSB_FIRST';  # .hlp
394                 next if $id=~/^Sendmail frozen configuration /;
395                 next if $id eq 'Assembler source';
396                 next if $id=~/^DBase 3 data file/;
397                 next if $id=~/^Macintosh HFS Extended version /;
398                 next if $id=~/^MPEG ADTS, /;    # .txt?,.hlp?
399                 next if $id eq 'MS-DOS executable (COM)';
400                 next if $id eq 'NE executable for MS Windows 3.x (driver)';
401                 next if $id eq 'TrueType font data';
402                 next if $id eq 'Windows NT registry file';
403                 next if $id eq 'lif file';      # .sig?
404                 next if $id=~/^Infocom game data /;     # ._p?
405                 next if $id=~/^Sun disk label /;        # .dl_?
406                 next if $id=~/^compiled Java class data, /;
407                 next if $id=~/^Minix filesystem/;       # .dl_?,_p?
408                 next if $id eq 'DBase 3 index file';    # .nls?
409                 next if $id=~/^Macintosh MFS data /;    # .cp_?
410                 next if $id eq 'ACB archive data';      # .cat?
411                 next if $id eq 'COM executable for MS-DOS';     # .edb?
412                 next if $id=~/^Bio-Rad .PIC Image File/;        # ._p?
413                 next if $id eq 'SysEx File - Gulbransen';       # .edb?
414
415                 # Do not:
416                 #next if $id=~/^PE executable for MS Windows .* 32-bit$/;       # TODO
417
418                 if ($id eq 'PE executable for MS Windows (GUI) Intel 80386 32-bit, InnoSetup self-extracting archive') {
419                         # FIXME: http://innounp.sourceforge.net/
420                         next;
421                         }
422                 if (0
423                                 || $id=~/^Microsoft Cabinet archive data, \d+ byte(?:s)?, \d+ file(?:s)?$/
424                                 || $id eq 'PE executable for MS Windows (GUI) Intel 80386 32-bit, MS CAB-Installer self-extracting archive'
425                                 # Catch all unidentified flying cabinets:
426                                 || $id=~/^PE executable for MS Windows .* 32-bit$/
427                                 ) {
428                         $this->{"type"}="cabinet";
429                         prep();
430                         spawn q{cabextract -q '}.$filename.q{' 2>&1|grep -v ': \(}.join(q{\|},
431                                         q{WARNING; possible [0-9]* extra bytes at end of file.},
432                                         q{no valid cabinets found},
433                                         q{checksum error},
434                                         q{error in CAB data format},
435                                         ).q{\)$';true};
436                         process();
437                         next;
438                         }
439                 # non-"32-bit" (amd64 in fact)
440                 next if $id=~/^PE executable for MS Windows/;
441                 if (0
442                                 || $id eq 'PE executable for MS Windows (GUI) Intel 80386 32-bit, ZIP self-extracting archive (WinZip)'
443                                 || $id=~/^Zip archive data, /
444                                 ) {
445                         prep();
446                         spawn "unzip -q '$filename'";
447                         process();
448                         next;
449                         }
450                 if ($id=~m{^gzip compressed data, was "\S+", from Win/32}) {
451                         prep();
452                         spawn "gzip -d <'$filename' >'$basename'";
453                         process();
454                         next;
455                         }
456                 if ($id eq 'tar archive') {
457                         prep();
458                         spawn "tar xf '$filename'";
459                         process();
460                         next;
461                         }
462                 warn "$filename: $mime - $id" if !$unknown{$mime,$id}++;
463                 }
464         close FIND or confess;
465         $dirnew=undef();
466 }
467
468
469 $dirnew=getcwd()."/".$dirnew if $dirnew!~m{^/};
470 $out   =getcwd()."/".$out    if $out && $out!~m{^/};
471 rm_rf $tmp;
472 mkdir_checked $tmp;
473 if ($out) {
474         rm_rf $out;
475         mkdir_checked $out;
476         }
477 process;
478 stack0_flush() if $last_stack0_filename;
479 print STDERR "\n" if $STDERR_needs_eol;
480 $STDERR_needs_eol=0;