Parts rewritten to parse ipni.org data
authorshort <>
Sat, 8 Dec 2001 04:31:49 +0000 (04:31 +0000)
committershort <>
Sat, 8 Dec 2001 04:31:49 +0000 (04:31 +0000)
kewensis-collect.pl

index c6f399b..aba360d 100755 (executable)
@@ -3,14 +3,17 @@
 use strict;
 use warnings;
 
 use strict;
 use warnings;
 
-use vars qw/%DB %OWNS %PARENT %CURR $last_field $debugmatch $debugparse $key $owner $parent $ref $note_rest $printdupl $maxsize $fileno $fileid $filename/;
+use Cwd;
+use Data::Dumper;
 
 
-$maxsize=0x40000;
+my(%DB,%OWNS,$debugmatch,$D,$key,$owner,$ref,$maxsize,$filename,$doimport,$import_xlate);
+
+# $maxsize=0x40000;
 %DB=();
 %DB=();
-%CURR=();
-$debugparse=0;
-$printdupl=0;
+$D=0;
 $debugmatch=0;
 $debugmatch=0;
+$doimport=1;
+$import_xlate=1;
 
 sub name_to_key
 {
 
 sub name_to_key
 {
@@ -24,25 +27,21 @@ my( $r )=@_;
 sub rec_to_name
 {
 my( %rec )=@_;
 sub rec_to_name
 {
 my( %rec )=@_;
-my( @list )=("Genus","Species");
-my( $item,$r );
 
 
-       $r="";
-       while ($item=shift @list)
-               { $r.=" ".$rec{$item} if exists $rec{$item}; }
-       $r=~s/^ //;
-       return $r;
+       return $rec{"name"};
 }
 
 sub htmlquote
 {
 }
 
 sub htmlquote
 {
-($_)=@_;
+my($class);
+($_,$class)=@_;
 
        s/&/&amp;/g;
        s/</&lt;/g;
        s/>/&gt;/g;
        s/\n/&nl;/g;
        s/"/&quot;/g;
 
        s/&/&amp;/g;
        s/</&lt;/g;
        s/>/&gt;/g;
        s/\n/&nl;/g;
        s/"/&quot;/g;
+       return "<span class=\"$class\">$_</span>" if defined $class;
        return $_;
 }
 
        return $_;
 }
 
@@ -50,12 +49,15 @@ sub format_record
 {
 my( $preinsert,$postinsert,%rec )=@_;
 
 {
 my( $preinsert,$postinsert,%rec )=@_;
 
-       my($r)="<p><span class=\"name\">".$preinsert.&htmlquote(&rec_to_name(%rec)).$postinsert."</span>";
-       $r.="\n".htmlquote($rec{"Publ. Author"}) if (exists($rec{"Publ. Author"}));
-       $r.="<br />\n".htmlquote($rec{"Publication"}) if (exists($rec{"Publication"}));
-       $r.="<br />\n".htmlquote($rec{"Notes"}) if (exists($rec{"Notes"}));
-       $r.="</p>\n";
-       return($r);
+       my($r)=$preinsert.htmlquote(rec_to_name(%rec),"name").$postinsert;
+       $r.="\n".htmlquote($rec{"Publ. Author"},"author") if (exists($rec{"Publ. Author"}));
+       $r.="<br />\n".htmlquote($rec{"Publication"},"publication") if (exists($rec{"Publication"}));
+       $r.="<br />\n".htmlquote($rec{"Notes"},"notes") if (exists($rec{"Notes"}));
+       if ($doimport && exists($rec{"html"})) {
+               my($import)="import: [".$rec{"id"}."]";
+               $r.="<br />\n<blockquote><!-- BEGIN $import -->\n".$rec{"html"}."\n<!-- END $import --></blockquote>\n";
+               }
+       return $r;
 }
 
 sub extract_year
 }
 
 sub extract_year
@@ -68,100 +70,154 @@ sub extract_year
                        $_=$';
                        }
                }
                        $_=$';
                        }
                }
-       return(-1);
+       return -1;
 }
 
 }
 
-sub flush_CURR
+sub failed
 {
 {
-my( $key );
-
-       $key=&name_to_key(&rec_to_name(%CURR));
-       return if (!$key);
-       delete $CURR{"Family"};
-       if (exists $DB{$key})
-               { print STDERR "Key \"$key\" is duplicated!\n" if $printdupl; }
-       else {
-               print STDERR "ADDKEY: $key\n" if $debugparse;
-               %{$DB{$key}}=%CURR;
-#              my($year)=&extract_year($CURR{"Notes"});
-#              %{$DB{$key}}{"year"}=$year if (defined($year));
-               }
-       %CURR=();
-       @{$OWNS{$key}}=();
+my($file)=@_;
+
+       print("-$file----------------$'--------------\n");
+       warn "Unable to match file \"".getcwd()."/$file\".";
 }
 
 }
 
-%OWNS=();
+sub process_file
+{
+my($file)=@_;
 
 
-while (<>) {
-       tr/\r\n//d;
-       if (/^ *\304\301\304/) {
-               &flush_CURR();
-               print STDERR "---\n" if $debugparse;
+       if (!open(FI,$file)) {
+               warn("Unable to open file \"$file\" in dir \"".getcwd()."\": $!");
+               return;
                }
                }
-       elsif (/^([^³]*[^ ]) *³ *(.*)$/) {
-               print STDERR "FIELD: $1, VALUE: $2\n" if $debugparse;
-               $last_field=$1;
-               $CURR{$1}=$2;
+       undef $/;
+       my($fi)=<FI>;
+       close(FI);
+       my($word)='[^<]*';
+  my($bigword)='[^<]*(?:<IT>)?[^<]*(?:<RO>)?[^<]*(?:<1000 m)?[^<]*';
+       my($idone)='[-\d]';
+       my($id)="$idone+";
+       my($any)='[\x00-\xFF]*';
+       my($ipniservletword)="<a href=\"\\./IpniServlet\\?id=($id)&query_type=by_id\">($word)</a>";
+       my($idquoted)=$id; $idquoted=~s%\W%\\$&%g;
+       my($ipniservletwordthree)=$ipniservletword; $ipniservletwordthree=~s%\($idquoted\)%((?:$idone){3})$&%os or die;
+       my($attrpat)="<p>nomenclatural synonym(\\(Main Record\\))?:$ipniservletword</p>|<p>basionym(\\(\\d+\\))?:$ipniservletword</p>|<p>basionym:($word)|<p>replaced synonym:$ipniservletword</p>|<p>replaced synonym:($word)|<p>Is a replaced synonym of:$ipniservletword</p>|<p>Is a basionym of:$ipniservletword</p>|<p>later publication of(\\(\\d+\\))?:$ipniservletword</p>|<p>Is a later publication of of:$ipniservletword</p>";
+       $|=1;
+       "<undef>"=~/^/;
+       if ($fi!~m%^<html>
+<head>
+<title>IPNI Query Results</title>
+</head>
+<body bgcolor="#ffffff" text="#000000" link="#006666" vlink="#008080" alink="#008080">
+<HR><b><i>Orchidaceae</i> ($word)</b> ($word) <br>
+((?:<a href="\./PublicationServlet\?id=($id)&query_type=by_id"> ($word)</a> ($word)|$bigword(?:<br>\n$word)*)?<p>($bigword)?</p><p>($word)?</p>(?:
+remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
+<h4>Linked Records</h2>
+((?:$attrpat)*))?(<br>
+
+<h4>Original Data</h2>
+(?:basionym: ($word)<br>
+)?(?:hybrid parentage: ($word)<br>
+)?(?:replaced synonym: ($word)<br>
+)?(?:distribution: ($word)<br>
+)?(?:Notes: ($word))?)?)(?:<p><a href="\./query_ipni.html">Back to Search Page</a></p>
+</body>
+</html>)$%os) {
+               failed($file);
+               return;
                }
                }
-       elsif (/^ *³ *(.*)$/) {
-               print STDERR "APPEND: $1\n" if $debugparse;
-               $CURR{$last_field}.=" ".$1;
+       my(%rec);
+       $rec{"name"}=$1;
+       $rec{"Publ. Author"}=$2;
+       $rec{"Publication"}="$5 $6" if defined($5) && defined($6);
+       $rec{"html"}=$3 if defined $3;
+       my($attrsbody)=$11;
+       ($rec{"id"}=$file)=~s#^($id)\.html$#$1#os or failed($file);
+       $rec{"html"}=~s#$ipniservletwordthree#<a href="$1/$1$2.html">$3</a>#osg if $import_xlate && exists $rec{"html"};
+       while (defined($attrsbody) && $attrsbody=~s%^(?:$attrpat)%%os) {
+               # nomenclatural synonym: id=$2
+               # basionym: id=$5
+               # replaced synonym: id=$8
+               # Is a replaced synonym: id=$11
+               # Is a basionym: id=$13
+               # later publication: id=$16
+               # Is a later publication: id=$18
+               my(@refs)=($2,$5,$8,$11,$13,$16,$18);
+               $rec{"refs"}=[];
+               while (@refs) {
+                       push(@{$rec{"refs"}},$_) if defined ($_=shift @refs);
+                       }
                }
                }
-       else {
-               print STDERR "DISCARD: $_\n" if $debugparse;
+       if ($attrsbody) {
+               failed($file);
+               return;
                }
                }
-       }
-&flush_CURR();
-undef %CURR;
+       $DB{$rec{"id"}}=\%rec;
+}
 
 
-%PARENT=();
 
 
-sub try_reparent
+sub process_dir
 {
 {
-my( $reparent,$refkey );
-
-       $refkey=&name_to_key($ref);
-       return 0 if ($refkey eq "");
-       return 0 if ($parent eq $refkey);
-       return 0 if !exists $DB{$refkey};
-       print STDERR "try_reparent: SUCCESS: key=\"$key\", refkey=\"$refkey\"\n" if $debugmatch;
-       if ($note_rest)
-               { $DB{$key}{"Notes"}=$note_rest; }
-       else
-               { delete $DB{$key}{"Notes"}; }
-
-       foreach $reparent (@{$OWNS{$parent}}) {
-               $PARENT{$reparent}=$refkey;
+my( $dir )=@_;
+my( $old )=getcwd();
+
+       if (!chdir($dir)) {
+               warn("Unable to change to dir \"$dir\" from dir \"$old\": $!");
+               return;
+               }
+       opendir(DIR,".") or die("Cannot open . in \"".getcwd()."\": $!");
+       foreach (sort readdir(DIR)) {
+               process_entry($_) if ($_!~/^\./);
                }
                }
-       @{$OWNS{$parent}}=() if (!exists $OWNS{$parent});
-       @{$OWNS{$refkey}}=() if (!exists $OWNS{$refkey});
-       @{$OWNS{$refkey}}=(@{$OWNS{$refkey}},$parent,@{$OWNS{$parent}});
-       delete $OWNS{$parent};
+       closedir(DIR);
+       chdir($old) or warn("Unable to retreat to dir \"$old\": $!");
+}
+
+sub process_entry
+{
+my( $entry )=@_;
 
 
-       return 1;
+       process_dir ($entry) if -d $entry;
+       process_file($entry) if -f $entry;
 }
 
 }
 
-MATCH: foreach $key (keys %DB) {
-       $parent=$PARENT{$key};
-       $parent=$key if (!defined $parent); 
-       $ref=$DB{$key}{"Notes"};
-       next MATCH if (!defined $ref);
-       $ref=~tr/()//d;
-       $ref=~s/^=//;
-       $ref=~s/^O\. *//i;
-       $ref=~s/^Orchidaceae *//i;
-       $ref=~s/\. *(.*)$//;
-       $note_rest=$1;
-       next MATCH if &try_reparent();
-       $ref=$DB{$key}{"Rank"}." $ref" if exists $DB{$key}{"Rank"};
-       next MATCH if &try_reparent();
-       $ref=$DB{$key}{"Genus"}." $ref" if exists $DB{$key}{"Genus"};
-       next MATCH if &try_reparent();
+%OWNS=();
+
+foreach (@ARGV)
+       { process_entry($_); }
+
+my($id);
+for $id (keys %DB) {
+       my($refid);
+       my(@refs);
+       for $refid (@{$DB{$id}{"refs"}}) {
+               if (!exists $DB{$refid}) {
+                       warn "Undefined ref id \"$refid\" from id \"$id\"" if $D;
+                       next;
+                       }
+               next if $id eq $refid;  # self-ref
+#              push(@refs,$DB{$refid});
+               push(@refs,$refid);
+               }
+       $DB{$id}{"refs"}=\@refs;
        }
        }
-undef %PARENT;
+
+print Data::Dumper->Dump([\%DB],["%DB"]) if $D;
+
+%OWNS=map { $_=>[] } keys(%DB);
+
+for $id (keys %OWNS) {
+       my($refid);
+       for $refid (@{$DB{$id}{"refs"}}) {
+               next if !exists $OWNS{$refid};
+               push(@{$OWNS{$id}},$refid,@{$OWNS{$refid}});
+               delete $OWNS{$refid};
+               }
+       }
+
+print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D;
 
 #foreach $key (keys %DB) {
 
 #foreach $key (keys %DB) {
-#      $DB{$key}{"Publication"}=&extract_year($DB{$key}{"Publication"})." ::: ".$DB{$key}{"Publication"}
+#      $DB{$key}{"Publication"}=extract_year($DB{$key}{"Publication"})." ::: ".$DB{$key}{"Publication"}
 #                      if (exists($DB{$key}{"Publication"}));
 #      }
 
 #                      if (exists($DB{$key}{"Publication"}));
 #      }
 
@@ -169,9 +225,9 @@ foreach $key (keys %OWNS) {
        my(@keys)=@{$OWNS{$key}};
        delete($OWNS{$key});
        unshift(@keys,$key);
        my(@keys)=@{$OWNS{$key}};
        delete($OWNS{$key});
        unshift(@keys,$key);
-       @keys=sort { &extract_year($DB{$b}{"Publication"}) <=> &extract_year($DB{$a}{"Publication"}); } @keys;
+       @keys=sort { extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys;
        my($pkey)=shift(@keys);
        my($pkey)=shift(@keys);
-       @{$OWNS{$pkey}}=@keys;
+       $OWNS{$pkey}=\@keys;
        }
 
 sub print_header
        }
 
 sub print_header
@@ -179,76 +235,89 @@ sub print_header
 my($header)=@_;
 
        print OUT
 my($header)=@_;
 
        print OUT
-"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>
+"<?xml version=\"1.0\" encoding=\"utf-8\"?>
 <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">
 <html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
 <head><title>Kewensis $header</title>
 <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">
 <html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
 <head><title>Kewensis $header</title>
+<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />
+<base href=\"".($import_xlate ? "data/" : "http://www.ipni.org/ipni/")."\" />
 <style type=\"text/css\"><!--
 .name { font-weight: bold; }
 <style type=\"text/css\"><!--
 .name { font-weight: bold; }
+.author { font-variant: small-caps; }
+.publication { }
+.notes { }
 --></style>
 </head><body>
 \n";
 }
 
 --></style>
 </head><body>
 \n";
 }
 
-$fileno=-1;
-
 sub OUT_flush
 {
        print OUT "</body></html>\n";
        close(OUT);
 }
 
 sub OUT_flush
 {
        print OUT "</body></html>\n";
        close(OUT);
 }
 
-my( $filename );
-foreach $owner (sort keys %OWNS) {
-my( $child );
-
-       if ($fileno<0 || tell(OUT)>=$maxsize) {
-               &OUT_flush() if ($fileno>=0);
-               $fileid=sprintf("%04d",++$fileno);
-               $filename="kew-$fileid.html";
-               open(OUT,">$filename") or die "Cannot open \"$filename\": $!";
-               &print_header("chunk $fileid");
-               }
+if (defined $maxsize) {
+       my($fileno)=-1;
+       my($filename,$fileid);
+       foreach $owner (sort keys %OWNS) {
+       my( $child );
+
+               if ($fileno<0 || tell(OUT)>=$maxsize) {
+                       OUT_flush() if ($fileno>=0);
+                       $fileid=sprintf("%04d",++$fileno);
+                       $filename="kew-$fileid.html";
+                       open(OUT,">$filename") or die "Cannot open \"$filename\": $!";
+                       print_header("chunk $fileid");
+                       }
 
 
-       if ($debugmatch) {
-               print STDERR "($owner):";
-               foreach $child (@{$OWNS{$owner}})
-                       { print STDERR " ($child)"; }
-               print STDERR "\n";
-               }
-       print OUT &format_record("<a id=\"".&htmlquote($owner)."\">","</a>",%{$DB{$owner}});
-       if (@{$OWNS{$owner}}) {
-               print OUT "<blockquote>\n";
-               foreach $child (@{$OWNS{$owner}})
-                       { print OUT &format_record("","",%{$DB{$child}}); }
-               print OUT "</blockquote>\n";
+               if ($debugmatch) {
+                       print STDERR "($owner):";
+                       foreach $child (@{$OWNS{$owner}})
+                               { print STDERR " ($child)"; }
+                       print STDERR "\n";
+                       }
+               print OUT format_record("<p><a id=\"".htmlquote($owner)."\">","</a></p>\n",%{$DB{$owner}});
+               if (@{$OWNS{$owner}}) {
+                       print OUT "<blockquote>\n";
+                       foreach $child (@{$OWNS{$owner}})
+                               { print OUT format_record("<p>","</p>\n",%{$DB{$child}}); }
+                       print OUT "</blockquote>\n";
+                       }
+               print OUT "\n";
+               $DB{$owner}{"_filename"}=$filename;
                }
                }
-       print OUT "\n";
-       $DB{$owner}{"_filename"}=$filename;
+       OUT_flush();
+       open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!";
+       print_header("Index");
+       }
+else {
+       open(OUT,">kewensis.html") or die "Cannot open \"kewensis.html\": $!";
+       print_header("Full");
        }
        }
-&OUT_flush();
-open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!";
-&print_header("Index");
 
 
-sub print_href
+sub format_href
 {
 {
-my($owner)=@_;
-my(%rec)=%{$DB{$owner}};
+my($preinsert,$postinsert,%rec)=@_;
 
 
-       print OUT &htmlquote(&rec_to_name(%rec));
+       return htmlquote(rec_to_name(%rec));
 }
 
 }
 
+my($printrecref)=(defined $maxsize ? \&format_href : \&format_record );
+
 foreach $owner (sort keys %OWNS) {
 my( $child );
 
 foreach $owner (sort keys %OWNS) {
 my( $child );
 
-       print OUT "<p><a href=\"".$DB{$owner}{"_filename"}."#$owner\">";
-       &print_href($owner);
-       print OUT "</a></p>";
+       print OUT "<p>";
+       print OUT "<a href=\"".$DB{$owner}{"_filename"}."#$owner\">" if defined $maxsize;
+       print OUT &{$printrecref}("","",%{$DB{$owner}});
+       print OUT "</a>" if defined $maxsize;
+       print OUT "</p>";
        if (@{$OWNS{$owner}}) {
                print OUT "<ul>\n";
                foreach $child (@{$OWNS{$owner}}) {
                        print OUT "<li>";
        if (@{$OWNS{$owner}}) {
                print OUT "<ul>\n";
                foreach $child (@{$OWNS{$owner}}) {
                        print OUT "<li>";
-                       &print_href($child);
+                       print OUT &{$printrecref}("","",%{$DB{$child}});
                        print OUT "</li>\n";
                        }
                print OUT "</ul>";
                        print OUT "</li>\n";
                        }
                print OUT "</ul>";