+Microsoft files downloader/extractor/.captivemodid.xml builders.
authorlace <>
Mon, 19 Dec 2005 07:30:46 +0000 (07:30 +0000)
committerlace <>
Mon, 19 Dec 2005 07:30:46 +0000 (07:30 +0000)
src/install/acquire/Makefile.am
src/install/acquire/captivemodid-list.pl [new file with mode: 0755]
src/install/acquire/captivemodid-print.pl [new file with mode: 0755]

index 7f97717..d55c791 100644 (file)
@@ -1,6 +1,6 @@
 # $Id$
 # automake source for drivers acquiring installation utility Makefile
-# Copyright (C) 2003 Jan Kratochvil <project-captive@jankratochvil.net>
+# Copyright (C) 2003-2005 Jan Kratochvil <project-captive@jankratochvil.net>
 # 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -67,7 +67,9 @@ CLEANFILES+=gnome_vfs_read_entire_file.c
 
 EXTRA_DIST+= \
                $(GLADE_IN) \
-               cabextract/.vimrc
+               cabextract/.vimrc \
+               captivemodid-list.pl \
+               captivemodid-print.pl
 
 captive_install_acquire-ui-gnome-interface.$(OBJEXT): ui-gnome-callbacks.h
 
diff --git a/src/install/acquire/captivemodid-list.pl b/src/install/acquire/captivemodid-list.pl
new file mode 100755 (executable)
index 0000000..2a1f22b
--- /dev/null
@@ -0,0 +1,139 @@
+#! /usr/bin/perl
+#
+# $Id$
+# Extract the list of files to download from Microsoft.
+# Copyright (C) 2005 Jan Kratochvil <project-captive@jankratochvil.net>
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; exactly version 2 of June 1991 is required
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+use strict;
+use warnings;
+
+require LWP::Simple;
+use URI::Escape;
+
+
+sub unamp($)
+{
+my($ref)=@_;
+
+       die "non-amp amp: $$ref" if $$ref=~/&(?!amp;)/;
+       $$ref=~s/&amp;/&/go;
+}
+
+print STDERR "Downloading index: ";
+my $found_total;
+my @found;
+my $found_processed=0;
+my $next='results.aspx?freetext=&productID=4C937A02-BAE0-4317-A1A9-0C56CD979D05&categoryId=7&period=&sortCriteria=date&nr=50&DisplayLang=en&type=a';
+my $BASE='http://www.microsoft.com/downloads/';
+my $VALIDATION_CODE=$ARGV[0] or die "ARGV[0]=VALIDATION_CODE";
+
+while ($next) {
+       my $page=LWP::Simple::get($BASE.$next) or die;
+       my($page_total,$page_first,$page_last)=($page=~m{<b>(\d+)</b> results found; results <b>(\d+)-(\d+)</b> shown.}) or die;
+       die if !$page_total;
+       $found_total=$page_total if !$found_total;
+       $found_total==$page_total or die;
+       $found_processed+1==$page_first or die "(found_processed+1)(".($found_processed+1)."!=page_first($page_first)";
+       $page_last>=$page_first or die;
+       if ($page_last<$found_total) {
+               $page_last==$page_first+50-1 or die;
+               ($next)=($page=~m{<a href="([^"]*)">Next\s+&gt;</a>}) or die;
+               unamp \$next;
+               $next=~/^info.aspx[?]/ or die;
+               }
+       else {
+               $page_last==$found_total or die;
+               $next=undef();
+               }
+
+       while ($page=~m{<p><a href="([^"]*)">[^<]*</a>(?:&nbsp;<a href=[^>]*><img[^>]*\balt="Genuine Windows download"[^>]*></a>)?</p>}g) {
+               my($url)=($1);
+               $found_processed++;
+               unamp \$url;
+               push @found,$url;
+               }
+       print STDERR ".";
+       $found_processed==$page_last or die "found_processed(".($found_processed).")!=page_last($page_last)";
+       }
+print STDERR " found: ".(0+@found)."\n";
+
+my @download;
+print STDERR "Downloading product pages: ";
+for my $found (@found) {
+       my $page=LWP::Simple::get($BASE.$found) or die;
+       my($lang)=($page=~m{<select name="displaylang" [^>]*>((?:<option [^>]*>[^<>]*</option>)+)</select>});
+       # <option value="en" selected>English</option><option value="fr">French</option></select>
+       print STDERR "+";
+       my @stage2;
+       if (!$lang) {
+               @stage2=$found;
+               }
+       else {
+               while ($lang=~s{<option value="([^"]*)"(?: selected)?>([^<>]*)</option>}{}) {
+                       my($short,$long)=($1,$2);
+                       (my $found_lang=$found)=~s/((?:\b|%26)DisplayLang(?:=|%3d))en\b/$1$short/ or die $found;
+                       push @stage2,$found_lang;
+                       }
+               !$lang or die;
+               die if !@stage2;
+               }
+       for my $stage2 (@stage2) {
+               my $delim='&';
+               $delim='%26' if $stage2=~/%26DisplayLang/i;
+               $stage2.="${delim}Hash=$VALIDATION_CODE";
+               $stage2=$BASE.$stage2;
+               my $page=LWP::Simple::get($stage2) or die $stage2;
+               while ($page=~m{<h2>Object moved to <a href="([^"]*)">}) {
+                       $stage2=$1;
+                       unamp \$stage2;
+                       $page=LWP::Simple::get($stage2) or die $stage2;
+                       }
+               my $download;
+               while ($page=~m{window.open(?:.)'(http://download.microsoft.com/download/[^']*)',null,}g) {
+                       die if $download;
+                       $download=$1;
+                       }
+               if ($download) {
+                       push @download,$download;
+                       print STDERR ".";
+                       }
+               elsif ($page=~m{<table id="multiFileList"(.*?)</table>}) {
+                       my $files=$1;
+                       my $download_orig=@download;
+                       while ($files=~m{<a href="([^"]*)">[^<>]*</a>}g) {
+                               my $url=$1;
+                               unamp \$url;
+                               push @download,$url;
+                               }
+                       die if $download_orig==@download;
+                       print STDERR "_";
+                       }
+               elsif ($page=~m{<div id="regsysNotRegistered">}) {
+                       print STDERR "!";
+                       }
+               else {
+                       die $stage2;
+                       }
+               }
+       }
+for (@download) {
+       next if !/^info.aspx[?].*&u=(http%3a[^&]*)$/;
+       $_=uri_unescape $1;
+       }
+@download=sort keys(%{{ map(($_=>1),@download) }});
+print STDERR " found downloads: ".(0+@download)."\n";
+print "$_\n" for @download;
diff --git a/src/install/acquire/captivemodid-print.pl b/src/install/acquire/captivemodid-print.pl
new file mode 100755 (executable)
index 0000000..3b3988e
--- /dev/null
@@ -0,0 +1,461 @@
+#! /usr/bin/perl
+#
+# $Id$
+# Scan downloaded Microsoft files to build the .captivemodid.xml file core.
+# Copyright (C) 2005 Jan Kratochvil <project-captive@jankratochvil.net>
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; exactly version 2 of June 1991 is required
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+use strict;
+use warnings;
+
+use Cwd;
+use Carp qw(confess cluck);
+use Digest::MD5 qw(md5_hex);
+use File::Remove qw(remove);
+use Getopt::Long;
+use Data::Dumper;
+
+
+my $tmp="/tmp/captivemodid";
+my $out;
+
+
+my $opt_quiet;
+die if !GetOptions(
+               "q|quiet+",\$opt_quiet,
+               );
+
+
+our $dirnew=$ARGV[0];
+do { $dirnew="."; warn "Defaulting ARGV[0] to: $dirnew"; } if !$dirnew;
+our $out=$ARGV[1];
+warn "No output md5 directory, not copying." if !$out;
+
+
+my %type=(
+       "ntoskrnl.exe"=>"Kernel",
+       "ntkrnlpa.exe"=>"Kernel PA",
+       "ntkrnlmp.exe"=>"Kernel SMP",
+       "ntkrpamp.exe"=>"Kernel SMP PA",
+       "cdfs.sys"    =>"CD-ROM/iso-9660",
+       "fastfat.sys" =>"FastFAT/vfat",
+       "ntfs.sys"    =>"NTFS",
+       );
+
+my %lang=(
+       "ar"=>"Arabic",
+       "hk"=>"Chinese (Hong Kong SAR)",
+       "cn"=>"Chinese (Simplified)",
+       "tw"=>"Chinese (Traditional)",
+       "cs"=>"Czech",
+       "da"=>"Danish",
+       "nl"=>"Dutch",
+       "en"=>"English",
+       "fi"=>"Finnish",
+       "fr"=>"French",
+       "de"=>"German",
+       "el"=>"Greek",
+       "he"=>"Hebrew",
+       "hu"=>"Hungarian",
+       "it"=>"Italian",
+       "ja"=>"Japanese",
+       "ko"=>"Korean",
+       "no"=>"Norwegian",
+       "pl"=>"Polish",
+       "br"=>"Portugese (Brazil)",
+       "pt"=>"Portugese (Portugal)",
+       "ru"=>"Russian",
+       "es"=>"Spanish",
+       "sv"=>"Swedish",
+       "tr"=>"Turkish",
+       "ara"=>"Arabic",
+       "chh"=>"Chinese (Hong Kong SAR)",
+       "chs"=>"Chinese (Simplified)",
+       "cht"=>"Chinese (Traditional)",
+       "csy"=>"Czech",
+       "dan"=>"Danish",
+       "nld"=>"Dutch",
+       "enu"=>"English",
+       "fin"=>"Finnish",
+       "fra"=>"French",
+       "deu"=>"German",
+       "ell"=>"Greek",
+       "heb"=>"Hebrew",
+       "hun"=>"Hungarian",
+       "ita"=>"Italian",
+       "jpn"=>"Japanese",
+       "kor"=>"Korean",
+       "nor"=>"Norwegian",
+       "plk"=>"Polish",
+       "ptb"=>"Portugese (Brazil)",
+       "ptg"=>"Portugese (Portugal)",
+       "rus"=>"Russian",
+       "esn"=>"Spanish",
+       "sve"=>"Swedish",
+       "trk"=>"Turkish",
+       );
+
+
+sub name_to_lang($)
+{
+my($name)=@_;
+
+       $name=~tr/-/_/;
+       $name=lc $name;
+       my $r;
+       while (my($subst,$long)=each(%lang)) {
+               next if $name!~/[^a-z]$subst[^a-z]/;
+# FIXME
+               cluck "$name: $subst" if $r;
+               $r=$long;
+               }
+# FIXME
+       cluck $name if !$r;
+       return $r;
+}
+
+my $ver_prefix="5.1.2600.";
+my @stack_out;
+my $STDERR_needs_eol;
+my %md5sum_printed_global;
+
+sub stack0_flush()
+{
+       return if $opt_quiet;
+       # FIXME: Proper nesting, not just these 2 levels:
+       my %md5sum_printed_local;
+       while (my $stack=shift @stack_out) {
+               for my $stacki (0..$#$stack) {
+                       my $this=$stack->[$stacki];
+                       next if $md5sum_printed_local{$this->{"md5"}}++;
+                       my $dupe_global=$md5sum_printed_global{$this->{"md5"}}++;
+                       my $print="";
+                       $print.=<<"HERE";
+<!-- @{[ $this->{"relname"} ]} -->
+HERE
+                       $print.=<<"HERE" if !$dupe_global;
+<module type="@{[ $this->{"type"} ]}" length="@{[ $this->{"length"} ]}" priority="@{[ $this->{"priority"} ]}" md5="@{[ $this->{"md5"} ]}"
+               id="MS-Windows XP @{[ $this->{"dist"} ]} @{[ $this->{"lang"} ]} $ver_prefix@{[ $this->{"ver"} ]} @{[ $this->{"this_name"} ]}" />
+HERE
+                       $print=~s/^/"\t" x (1+$stacki)/egm;
+                       print STDERR "\n" if $STDERR_needs_eol;
+                       $STDERR_needs_eol=0;
+                       print $print;
+                       }
+               }
+       print "\n";
+}
+
+our @stack;
+my %md5sum_stored;
+my $last_stack0_filename;
+
+sub check($$)
+{
+my($filename_unused,$basename_orig)=@_;
+
+       return if !$type{lc $basename_orig};
+       my $final_type=$basename_orig;
+       my $final_name=$type{$final_type} or confess;
+       $final_type="ntoskrnl.exe" if $final_type=~/^nt.*[.]exe$/;
+       (my $basename_orig_=$basename_orig)=~s/.$/_/;
+       (my $basename0=$stack[0]->{"filename"})=~s{^.*/}{} or confess;
+       stack0_flush() if $last_stack0_filename && $last_stack0_filename ne $stack[0]->{"filename"};
+       $last_stack0_filename=$stack[0]->{"filename"};
+       for my $stacki (0..$#stack) {
+               my $this=$stack[$stacki];
+               my $filename=$this->{"filename"} or confess;
+               my $relname=$this->{"relname"} or confess;
+               (my $basename=$filename)=~s{^.*/}{} or confess;
+               my $expanded=$stack[$#stack]->{"filename"};
+               my $this_name;
+               my $type;
+               if ($stacki==$#stack-1 && $basename eq $basename_orig_) {
+                       $this->{"type"} eq "cabinet" or confess;
+                       $type=$this->{"type"} or confess;
+                       $this_name="$final_name Cabinet";
+                       }
+               elsif ($stacki<$#stack) {
+                       $type=$this->{"type"} or confess;
+                       $this_name="Cabinet";
+                       }
+               else {
+                       $type=$final_type or confess;
+                       $this_name=$final_name;
+                       }
+               my $length=(stat $filename)[7] or confess;
+               my $dist=$stack[0]->{"filename"};
+               $dist=~s{^.*/}{} or confess;
+               my $lang=name_to_lang $basename0;
+               local *F;
+               open F,$expanded or confess;
+               my $buf="";
+               my $buf2;
+               my $ver;
+               while (my $got=read F,$buf2,0x1000) {
+                       confess $expanded if !defined $got;
+                       last if !$got;
+                       confess $expanded if $got<0;
+                       confess $expanded if length($buf2)!=$got;
+                       $buf.=$buf2;
+                       $ver=$buf;
+                       $ver=~s/\x00\x00+/!/g;
+                       $ver=~tr/\x00//d;
+                       $ver=~tr/!/\x00/;
+                       last if $ver=~s/^.*\x00\Q$ver_prefix\E(\d+)[ \x00].*$/$1/s;
+                       $ver=undef();
+                       $buf=substr($buf,-0x100);
+                       }
+               confess $expanded if !$ver;
+               close F or confess;
+               $ver=~/^\d+$/ or confess "$expanded: $ver";
+               my $pri=sprintf "510%04u00",$ver;
+               $pri+=90;
+               $pri-=10 if $lang ne "English";
+               $pri-=20 if $basename_orig=~/^ntkrnlpa/;
+               $pri-=40 if $basename_orig=~/^ntkrnlmp/;
+               $pri-=60 if $basename_orig=~/^ntkrpamp/;
+               local *F;
+               open F,$filename or confess;
+               my $md5obj=Digest::MD5->new();
+               $md5obj->addfile(*F);
+               close F or confess;
+               my $md5sum=lc $md5obj->hexdigest();
+               spawn("cp -p '$filename' '$out/$md5sum'") if !$md5sum_stored{$md5sum}++ && $out && $stacki==$#stack;
+
+               my %new=(
+                       %$this,
+                       "type"=>$type,
+                       "length"=>$length,
+                       "priority"=>$pri,
+                       "md5"=>$md5sum,
+                       "dist"=>$dist,
+                       "lang"=>$lang,
+                       "ver"=>$ver,
+                       "this_name"=>$this_name,
+                       );
+               # Highest priority/version of all the files in the cabinet:
+               for (qw(ver priority)) {
+                       next if !$this->{$_};
+                       $new{$_}=$this->{$_} if $new{$_}<$this->{$_};
+                       delete $this->{$_} if $this->{$_}<$new{$_};
+                       }
+               do { cluck Dumper($this,\%new) if $new{$_} ne $this->{$_}; } for keys(%$this);
+               %$this=%new;
+               }
+       push @stack_out,[@stack];
+}
+
+sub spawn($)
+{
+my($command)=@_;
+
+       my $code=system "($command)".' >&2';
+       confess "$code: $command" if $code;
+}
+
+sub rm_rf($)
+{
+my($dir)=@_;
+
+       return if !-e $dir;
+       # Do not: Can't remove directory xyzzy: Directory not empty
+       #         -r--r--r-- xyzzy
+       #         remove \1,$dir or confess "$dir: $!";
+       spawn "rm -rf '$dir'";
+}
+
+sub mkdir_checked
+{
+my(@dirs)=@_;
+
+       for (@dirs) {
+               mkdir $_ or confess "$_: $!";
+               }
+}
+
+
+our $depth=-1;
+
+sub prep()
+{
+       confess if $dirnew;
+       $dirnew="$tmp/$depth";
+       chdir "/" or confess;
+       rm_rf $dirnew;
+       mkdir_checked $dirnew;
+       chdir $dirnew or confess "chdir: $dirnew";
+       return $dirnew;
+}
+
+sub relname($)
+{
+my($filename)=@_;
+
+       return $filename if $filename=~s{^\Q$tmp\E/\d+/}{};
+       (my $basename=$filename)=~s{^.*/}{} or confess;
+       return $basename;
+}
+
+my %unknown;
+sub process();
+sub process()
+{
+       my $dirname=$dirnew or confess;
+       $dirnew=undef();
+       local $depth=$depth+1;
+       my @stack_orig=@stack;
+
+       local *FIND;
+       open FIND,"find '$dirname' -type f|sort|" or confess $dirname;
+       local $_;
+       while (<FIND>) {
+               next if $_ eq "";
+               if (!$depth) {
+                       print STDERR ".";
+                       $STDERR_needs_eol=1;
+                       }
+               chomp;
+               my $filename=$_;
+               $filename=~m{^/} or confess $filename;
+               (my $basename=$filename)=~s{^.*/}{} or confess;
+               -r $filename or do { cluck $filename; system("ls -l '$filename'"); };
+               my $this={
+                       "filename"=>$filename,
+                       "relname"=>relname($filename),
+                       };
+               @stack=(@stack_orig,$this);
+
+               check $filename,$basename;
+
+               local *MIME;
+               open MIME,"file -b -i '$filename'|" or confess;
+               my $mime=do { local $/; <MIME> or confess; };
+               close MIME or confess;
+               chomp $mime;
+               next if $mime=~m{^text/};
+               next if $mime=~m{^image/};
+               next if $mime=~m{^audio/};
+               next if $mime eq 'application/x-empty';
+               next if $mime eq 'application/msaccess';
+
+               local *ID;
+               open ID,"file -b '$filename'|" or confess;
+               my $id=do { local $/; <ID> or confess; };
+               close ID or confess;
+               chomp $id;
+               next if $id eq 'Microsoft Office Document';
+               next if $id=~/^PDF document, version 1[.]\d+$/;
+               next if $id eq 'Rich Text Format data, version 1,';
+               next if $id=~/^x86 boot sector, /;
+               next if $id eq 'data';
+               next if $id eq 'MZ executable for MS-DOS';
+               next if $id eq 'XML document text';
+               next if $id eq 'MS Windows HtmlHelp Data';
+               next if $id eq 'MPEG sequence';
+               next if $id=~/^MSVC program database ver /;
+               next if $id eq 'Lotus 1-2-3';
+               next if $id eq 'MS Windows Help Data';
+               next if $id=~/^Macromedia Flash data, /;
+               next if $id eq 'Microsoft ASF'; # .wmv
+               next if $id eq 'GLF_BINARY_LSB_FIRST';  # .hlp
+               next if $id=~/^Sendmail frozen configuration /;
+               next if $id eq 'Assembler source';
+               next if $id=~/^DBase 3 data file/;
+               next if $id=~/^Macintosh HFS Extended version /;
+               next if $id=~/^MPEG ADTS, /;    # .txt?,.hlp?
+               next if $id eq 'MS-DOS executable (COM)';
+               next if $id eq 'NE executable for MS Windows 3.x (driver)';
+               next if $id eq 'TrueType font data';
+               next if $id eq 'Windows NT registry file';
+               next if $id eq 'lif file';      # .sig?
+               next if $id=~/^Infocom game data /;     # ._p?
+               next if $id=~/^Sun disk label /;        # .dl_?
+               next if $id=~/^compiled Java class data, /;
+               next if $id=~/^Minix filesystem/;       # .dl_?,_p?
+               next if $id eq 'DBase 3 index file';    # .nls?
+               next if $id=~/^Macintosh MFS data /;    # .cp_?
+               next if $id eq 'ACB archive data';      # .cat?
+               next if $id eq 'COM executable for MS-DOS';     # .edb?
+               next if $id=~/^Bio-Rad .PIC Image File/;        # ._p?
+               next if $id eq 'SysEx File - Gulbransen';       # .edb?
+
+               # Do not:
+               #next if $id=~/^PE executable for MS Windows .* 32-bit$/;       # TODO
+
+               if ($id eq 'PE executable for MS Windows (GUI) Intel 80386 32-bit, InnoSetup self-extracting archive') {
+                       # FIXME: http://innounp.sourceforge.net/
+                       next;
+                       }
+               if (0
+                               || $id=~/^Microsoft Cabinet archive data, \d+ byte(?:s)?, \d+ file(?:s)?$/
+                               || $id eq 'PE executable for MS Windows (GUI) Intel 80386 32-bit, MS CAB-Installer self-extracting archive'
+                               # Catch all unidentified flying cabinets:
+                               || $id=~/^PE executable for MS Windows .* 32-bit$/
+                               ) {
+                       $this->{"type"}="cabinet";
+                       prep();
+                       spawn q{cabextract -q '}.$filename.q{' 2>&1|grep -v ': \(}.join(q{\|},
+                                       q{WARNING; possible [0-9]* extra bytes at end of file.},
+                                       q{no valid cabinets found},
+                                       q{checksum error},
+                                       q{error in CAB data format},
+                                       ).q{\)$';true};
+                       process();
+                       next;
+                       }
+               # non-"32-bit" (amd64 in fact)
+               next if $id=~/^PE executable for MS Windows/;
+               if (0
+                               || $id eq 'PE executable for MS Windows (GUI) Intel 80386 32-bit, ZIP self-extracting archive (WinZip)'
+                               || $id=~/^Zip archive data, /
+                               ) {
+                       prep();
+                       spawn "unzip -q '$filename'";
+                       process();
+                       next;
+                       }
+               if ($id=~m{^gzip compressed data, was "\S+", from Win/32}) {
+                       prep();
+                       spawn "gzip -d <'$filename' >'$basename'";
+                       process();
+                       next;
+                       }
+               if ($id eq 'tar archive') {
+                       prep();
+                       spawn "tar xf '$filename'";
+                       process();
+                       next;
+                       }
+               warn "$filename: $mime - $id" if !$unknown{$mime,$id}++;
+               }
+       close FIND or confess;
+       $dirnew=undef();
+}
+
+
+$dirnew=getcwd()."/".$dirnew if $dirnew!~m{^/};
+$out   =getcwd()."/".$out    if $out && $out!~m{^/};
+rm_rf $tmp;
+mkdir_checked $tmp;
+if ($out) {
+       rm_rf $out;
+       mkdir_checked $out;
+       }
+process;
+stack0_flush() if $last_stack0_filename;
+print STDERR "\n" if $STDERR_needs_eol;
+$STDERR_needs_eol=0;