Inside-equiv sorting implemented
[kewensis.git] / kewensis-collect.pl
index f71d434..c6f399b 100755 (executable)
@@ -1,7 +1,11 @@
-#! /usr/bin/perl -w
+#! /usr/bin/perl
 
-use vars qw/%DB %OWNS %PARENT %CURR $last_field $debugmatch $debugparse $key $owner $parent $ref/;
+use strict;
+use warnings;
 
+use vars qw/%DB %OWNS %PARENT %CURR $last_field $debugmatch $debugparse $key $owner $parent $ref $note_rest $printdupl $maxsize $fileno $fileid $filename/;
+
+$maxsize=0x40000;
 %DB=();
 %CURR=();
 $debugparse=0;
@@ -20,7 +24,7 @@ my( $r )=@_;
 sub rec_to_name
 {
 my( %rec )=@_;
-my( @list )=("Rank","Infrafam.","Genus","Species");
+my( @list )=("Genus","Species");
 my( $item,$r );
 
        $r="";
@@ -30,21 +34,64 @@ my( $item,$r );
        return $r;
 }
 
+sub htmlquote
+{
+($_)=@_;
+
+       s/&/&/g;
+       s/</&lt;/g;
+       s/>/&gt;/g;
+       s/\n/&nl;/g;
+       s/"/&quot;/g;
+       return $_;
+}
+
+sub format_record
+{
+my( $preinsert,$postinsert,%rec )=@_;
+
+       my($r)="<p><span class=\"name\">".$preinsert.&htmlquote(&rec_to_name(%rec)).$postinsert."</span>";
+       $r.="\n".htmlquote($rec{"Publ. Author"}) if (exists($rec{"Publ. Author"}));
+       $r.="<br />\n".htmlquote($rec{"Publication"}) if (exists($rec{"Publication"}));
+       $r.="<br />\n".htmlquote($rec{"Notes"}) if (exists($rec{"Notes"}));
+       $r.="</p>\n";
+       return($r);
+}
+
+sub extract_year
+{
+       ($_)=@_;
+       if (defined($_)) {
+               s/\(([0-9]{4}) publ. [0-9]{4}\)/($1)/g;
+               while (/\(([0-9]{4})(-[0-9]+)?\)/) {
+                       return $1 if ($1>=1700 && $1<=2010);
+                       $_=$';
+                       }
+               }
+       return(-1);
+}
+
 sub flush_CURR
 {
 my( $key );
 
        $key=&name_to_key(&rec_to_name(%CURR));
-       return if (!defined $key);
+       return if (!$key);
+       delete $CURR{"Family"};
        if (exists $DB{$key})
                { print STDERR "Key \"$key\" is duplicated!\n" if $printdupl; }
        else {
                print STDERR "ADDKEY: $key\n" if $debugparse;
                %{$DB{$key}}=%CURR;
+#              my($year)=&extract_year($CURR{"Notes"});
+#              %{$DB{$key}}{"year"}=$year if (defined($year));
                }
        %CURR=();
+       @{$OWNS{$key}}=();
 }
 
+%OWNS=();
+
 while (<>) {
        tr/\r\n//d;
        if (/^ *\304\301\304/) {
@@ -67,7 +114,6 @@ while (<>) {
 &flush_CURR();
 undef %CURR;
 
-%OWNS=();
 %PARENT=();
 
 sub try_reparent
@@ -79,12 +125,16 @@ my( $reparent,$refkey );
        return 0 if ($parent eq $refkey);
        return 0 if !exists $DB{$refkey};
        print STDERR "try_reparent: SUCCESS: key=\"$key\", refkey=\"$refkey\"\n" if $debugmatch;
+       if ($note_rest)
+               { $DB{$key}{"Notes"}=$note_rest; }
+       else
+               { delete $DB{$key}{"Notes"}; }
 
        foreach $reparent (@{$OWNS{$parent}}) {
                $PARENT{$reparent}=$refkey;
                }
-       @{$OWNS{$parent}}=() if (!defined $OWNS{$parent});
-       @{$OWNS{$refkey}}=() if (!defined $OWNS{$refkey});
+       @{$OWNS{$parent}}=() if (!exists $OWNS{$parent});
+       @{$OWNS{$refkey}}=() if (!exists $OWNS{$refkey});
        @{$OWNS{$refkey}}=(@{$OWNS{$refkey}},$parent,@{$OWNS{$parent}});
        delete $OWNS{$parent};
 
@@ -100,7 +150,8 @@ MATCH: foreach $key (keys %DB) {
        $ref=~s/^=//;
        $ref=~s/^O\. *//i;
        $ref=~s/^Orchidaceae *//i;
-       $ref=~s/\..*$//;
+       $ref=~s/\. *(.*)$//;
+       $note_rest=$1;
        next MATCH if &try_reparent();
        $ref=$DB{$key}{"Rank"}." $ref" if exists $DB{$key}{"Rank"};
        next MATCH if &try_reparent();
@@ -109,17 +160,99 @@ MATCH: foreach $key (keys %DB) {
        }
 undef %PARENT;
 
-foreach $owner (keys %OWNS) {
+#foreach $key (keys %DB) {
+#      $DB{$key}{"Publication"}=&extract_year($DB{$key}{"Publication"})." ::: ".$DB{$key}{"Publication"}
+#                      if (exists($DB{$key}{"Publication"}));
+#      }
+
+foreach $key (keys %OWNS) {
+       my(@keys)=@{$OWNS{$key}};
+       delete($OWNS{$key});
+       unshift(@keys,$key);
+       @keys=sort { &extract_year($DB{$b}{"Publication"}) <=> &extract_year($DB{$a}{"Publication"}); } @keys;
+       my($pkey)=shift(@keys);
+       @{$OWNS{$pkey}}=@keys;
+       }
+
+sub print_header
+{
+my($header)=@_;
+
+       print OUT
+"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>
+<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">
+<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
+<head><title>Kewensis $header</title>
+<style type=\"text/css\"><!--
+.name { font-weight: bold; }
+--></style>
+</head><body>
+\n";
+}
+
+$fileno=-1;
+
+sub OUT_flush
+{
+       print OUT "</body></html>\n";
+       close(OUT);
+}
+
+my( $filename );
+foreach $owner (sort keys %OWNS) {
 my( $child );
 
+       if ($fileno<0 || tell(OUT)>=$maxsize) {
+               &OUT_flush() if ($fileno>=0);
+               $fileid=sprintf("%04d",++$fileno);
+               $filename="kew-$fileid.html";
+               open(OUT,">$filename") or die "Cannot open \"$filename\": $!";
+               &print_header("chunk $fileid");
+               }
+
        if ($debugmatch) {
                print STDERR "($owner):";
                foreach $child (@{$OWNS{$owner}})
                        { print STDERR " ($child)"; }
                print STDERR "\n";
                }
-       print "(".&rec_to_name(%{$DB{$owner}}).")";
-       foreach $child (@{$OWNS{$owner}})
-               { print " (".&rec_to_name(%{$DB{$child}}).")"; }
-       print "\n";
+       print OUT &format_record("<a id=\"".&htmlquote($owner)."\">","</a>",%{$DB{$owner}});
+       if (@{$OWNS{$owner}}) {
+               print OUT "<blockquote>\n";
+               foreach $child (@{$OWNS{$owner}})
+                       { print OUT &format_record("","",%{$DB{$child}}); }
+               print OUT "</blockquote>\n";
+               }
+       print OUT "\n";
+       $DB{$owner}{"_filename"}=$filename;
+       }
+&OUT_flush();
+open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!";
+&print_header("Index");
+
+sub print_href
+{
+my($owner)=@_;
+my(%rec)=%{$DB{$owner}};
+
+       print OUT &htmlquote(&rec_to_name(%rec));
+}
+
+foreach $owner (sort keys %OWNS) {
+my( $child );
+
+       print OUT "<p><a href=\"".$DB{$owner}{"_filename"}."#$owner\">";
+       &print_href($owner);
+       print OUT "</a></p>";
+       if (@{$OWNS{$owner}}) {
+               print OUT "<ul>\n";
+               foreach $child (@{$OWNS{$owner}}) {
+                       print OUT "<li>";
+                       &print_href($child);
+                       print OUT "</li>\n";
+                       }
+               print OUT "</ul>";
+               }
+       print OUT "\n";
        }
+OUT_flush();