Update.

[kewensis.git] / kewensis-collect.pl
diff --git a/kewensis-collect.pl b/kewensis-collect.pl

index aba360d..61b6d88 100755 (executable)
--- a/kewensis-collect.pl
+++ b/kewensis-collect.pl
@@ -5,61 +5,16 @@ use warnings;
  
  use Cwd;
  use Data::Dumper;
+use DBI;
  
-my(%DB,%OWNS,$debugmatch,$D,$key,$owner,$ref,$maxsize,$filename,$doimport,$import_xlate);
+my(%DB,$debugmatch,$D,$key,$owner,$ref,$filename,$doimport,$import_xlate);
  
-# $maxsize=0x40000;
  %DB=();
-$D=0;
+$D=1;
  $debugmatch=0;
  $doimport=1;
  $import_xlate=1;
  
-sub name_to_key
-{
-my( $r )=@_;
-
-       $r=~tr/A-Z/a-z/;
-       $r=~tr/a-z0-9//cd;
-       return $r;
-}
-
-sub rec_to_name
-{
-my( %rec )=@_;
-
-       return $rec{"name"};
-}
-
-sub htmlquote
-{
-my($class);
-($_,$class)=@_;
-
-       s/&/&amp;/g;
-       s/</&lt;/g;
-       s/>/&gt;/g;
-       s/\n/&nl;/g;
-       s/"/&quot;/g;
-       return "<span class=\"$class\">$_</span>" if defined $class;
-       return $_;
-}
-
-sub format_record
-{
-my( $preinsert,$postinsert,%rec )=@_;
-
-       my($r)=$preinsert.htmlquote(rec_to_name(%rec),"name").$postinsert;
-       $r.="\n".htmlquote($rec{"Publ. Author"},"author") if (exists($rec{"Publ. Author"}));
-       $r.="<br />\n".htmlquote($rec{"Publication"},"publication") if (exists($rec{"Publication"}));
-       $r.="<br />\n".htmlquote($rec{"Notes"},"notes") if (exists($rec{"Notes"}));
-       if ($doimport && exists($rec{"html"})) {
-               my($import)="import: [".$rec{"id"}."]";
-               $r.="<br />\n<blockquote><!-- BEGIN $import -->\n".$rec{"html"}."\n<!-- END $import --></blockquote>\n";
-               }
-       return $r;
-}
-
  sub extract_year
  {
         ($_)=@_;
@@ -109,7 +64,7 @@ my($file)=@_;
  </head>
  <body bgcolor="#ffffff" text="#000000" link="#006666" vlink="#008080" alink="#008080">
  <HR><b><i>Orchidaceae</i> ($word)</b> ($word) <br>
-((?:<a href="\./PublicationServlet\?id=($id)&query_type=by_id"> ($word)</a> ($word)|$bigword(?:<br>\n$word)*)?<p>($bigword)?</p><p>($word)?</p>(?:
+(?:<a href="\./PublicationServlet\?id=($id)&query_type=by_id"> ($word)</a> ($word)|($bigword(?:<br>\n$word)*))?(<p>($bigword)?</p><p>($word)?</p>(?:
  remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
  <h4>Linked Records</h2>
  ((?:$attrpat)*))?(<br>
@@ -128,11 +83,18 @@ remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
         my(%rec);
         $rec{"name"}=$1;
         $rec{"Publ. Author"}=$2;
-       $rec{"Publication"}="$5 $6" if defined($5) && defined($6);
-       $rec{"html"}=$3 if defined $3;
-       my($attrsbody)=$11;
+       my $publ="";    # publication text from the link form ($4 $5) or the plain-text form ($6)
+       $publ.="$4 $5" if defined($4) && defined($5);
+       $publ.=$6 if defined $6;
+       $rec{"html"}=$7 if defined $7;
+       my($attrsbody)=$12;
+       # All captures must be saved above: the s///g below may overwrite $1..$N.
+       $publ=~s/\<br\>//g;     # strip <br> from $publ itself (".=" ran s/// on $_ and appended its result)
+       $rec{"Publication"}=$publ if $publ ne "";
         ($rec{"id"}=$file)=~s#^($id)\.html$#$1#os or failed($file);
         $rec{"html"}=~s#$ipniservletwordthree#<a href="$1/$1$2.html">$3</a>#osg if $import_xlate && exists $rec{"html"};
+       $rec{"refs"}=[];
+       my $score=0; # -: upper, +: lower
         while (defined($attrsbody) && $attrsbody=~s%^(?:$attrpat)%%os) {
                 # nomenclatural synonym: id=$2
                 # basionym: id=$5
@@ -142,11 +104,16 @@ remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
                 # later publication: id=$16
                 # Is a later publication: id=$18
                 my(@refs)=($2,$5,$8,$11,$13,$16,$18);
-               $rec{"refs"}=[];
                 while (@refs) {
                         push(@{$rec{"refs"}},$_) if defined ($_=shift @refs);
                         }
+               $score+=+10 if defined  $5; # basionym: id=$5
+               $score+=-10 if defined $13; # Is a basionym: id=$13
+               $score+=- 4 if defined $16; # later publication: id=$16
+               $score+=+ 4 if defined $18; # Is a later publication: id=$18
                 }
+       $score+=+9 if $rec{"Publ. Author"}=~/^\s*\(/;
+       $rec{"score"}=$score;
         if ($attrsbody) {
                 failed($file);
                 return;
@@ -180,18 +147,15 @@ my( $entry )=@_;
         process_file($entry) if -f $entry;
  }
  
-%OWNS=();
-
  foreach (@ARGV)
         { process_entry($_); }
  
-my($id);
-for $id (keys %DB) {
+for my $id (keys %DB) {
         my($refid);
         my(@refs);
         for $refid (@{$DB{$id}{"refs"}}) {
                 if (!exists $DB{$refid}) {
-                       warn "Undefined ref id \"$refid\" from id \"$id\"" if $D;
+                       print "Undefined ref id \"$refid\" from id \"$id\"\n" if $D>=1;
                         next;
                         }
                 next if $id eq $refid;  # self-ref
@@ -201,127 +165,95 @@ for $id (keys %DB) {
         $DB{$id}{"refs"}=\@refs;
         }
  
-print Data::Dumper->Dump([\%DB],["%DB"]) if $D;
+print Data::Dumper->Dump([\%DB],["%DB"]) if $D>=2;
  
-%OWNS=map { $_=>[] } keys(%DB);
+my %OWNS=map { $_=>$DB{$_}{"refs"}; } keys(%DB);       # id => list of referenced ids (initially the record's own "refs" list)
+my %NAMES=();  # name => list of ids carrying that name
  
-for $id (keys %OWNS) {
-       my($refid);
-       for $refid (@{$DB{$id}{"refs"}}) {
+for my $id (keys %DB) {        # index every record id under its name
+       $NAMES{$DB{$id}{"name"}}=[] if !exists $NAMES{$DB{$id}{"name"}};
+       push @{$NAMES{$DB{$id}{"name"}}},$id;
+       }
+
+for my $id (keys %DB) {        # merge records linked by refs or by identical name under one surviving %OWNS key
+       next if !exists $OWNS{$id};     # already absorbed into another family
+       my @queue=($id,@{$OWNS{$id}}); # work list: this id plus its direct refs
+       $OWNS{$id}=[]; # rebuilt below as the list of absorbed member ids
+       while (my $refid=shift @queue) {        # NOTE(review): also stops on a false id ("0" or "") -- confirm ids are never false
+               if (exists $NAMES{$DB{$refid}{"name"}}) {       # first visit of this name: enqueue every id sharing it
+                       push(@queue,@{$NAMES{$DB{$refid}{"name"}}});
+                       delete $NAMES{$DB{$refid}{"name"}};     # expand each name only once
+                       }
+               next if $refid eq $id;  # do not absorb the family head into itself
                 next if !exists $OWNS{$refid};
-               push(@{$OWNS{$id}},$refid,@{$OWNS{$refid}});
+               push @queue,@{$OWNS{$refid}};   # follow the absorbed record's own refs too
                 delete $OWNS{$refid};
+               print "processed connect id=$id <- refid=$refid\n" if $D>=1;
+               push @{$OWNS{$id}},$refid;      # $refid now belongs to $id's family
                 }
         }
  
-print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D;
-
-#foreach $key (keys %DB) {
-#      $DB{$key}{"Publication"}=extract_year($DB{$key}{"Publication"})." ::: ".$DB{$key}{"Publication"}
-#                      if (exists($DB{$key}{"Publication"}));
-#      }
+print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D>=2;
  
  foreach $key (keys %OWNS) {
         my(@keys)=@{$OWNS{$key}};
         delete($OWNS{$key});
         unshift(@keys,$key);
-       @keys=sort { extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys;
+#      @keys=sort { $DB{$a}{"score"} <=> $DB{$b}{"score"}
+#                      || extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys;
+       @keys=sort { $DB{$a}{"name"} cmp $DB{$b}{"name"}; } @keys;
         my($pkey)=shift(@keys);
         $OWNS{$pkey}=\@keys;
         }
  
-sub print_header
-{
-my($header)=@_;
-
-       print OUT
-"<?xml version=\"1.0\" encoding=\"utf-8\"?>
-<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">
-<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
-<head><title>Kewensis $header</title>
-<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />
-<base href=\"".($import_xlate ? "data/" : "http://www.ipni.org/ipni/")."\" />
-<style type=\"text/css\"><!--
-.name { font-weight: bold; }
-.author { font-variant: small-caps; }
-.publication { }
-.notes { }
---></style>
-</head><body>
-\n";
-}
+my($db_driver,$db_host,$db_user,$db_pwd,$DB_PWD,$db_name,$db);
+my($tb_tree);
  
-sub OUT_flush
-{
-       print OUT "</body></html>\n";
-       close(OUT);
-}
+$db_driver="mysql";
+$db_host="";
+$db_user="short";
+#$DB_PWD=$ENV{"HOME"}."/priv/mysql.${db_user}.pwd";
+$db_pwd="short";
+$db_name="short";
+$tb_tree="kewensis_tree";
  
-if (defined $maxsize) {
-       my($fileno)=-1;
-       my($filename,$fileid);
-       foreach $owner (sort keys %OWNS) {
-       my( $child );
-
-               if ($fileno<0 || tell(OUT)>=$maxsize) {
-                       OUT_flush() if ($fileno>=0);
-                       $fileid=sprintf("%04d",++$fileno);
-                       $filename="kew-$fileid.html";
-                       open(OUT,">$filename") or die "Cannot open \"$filename\": $!";
-                       print_header("chunk $fileid");
-                       }
-
-               if ($debugmatch) {
-                       print STDERR "($owner):";
-                       foreach $child (@{$OWNS{$owner}})
-                               { print STDERR " ($child)"; }
-                       print STDERR "\n";
-                       }
-               print OUT format_record("<p><a id=\"".htmlquote($owner)."\">","</a></p>\n",%{$DB{$owner}});
-               if (@{$OWNS{$owner}}) {
-                       print OUT "<blockquote>\n";
-                       foreach $child (@{$OWNS{$owner}})
-                               { print OUT format_record("<p>","</p>\n",%{$DB{$child}}); }
-                       print OUT "</blockquote>\n";
-                       }
-               print OUT "\n";
-               $DB{$owner}{"_filename"}=$filename;
-               }
-       OUT_flush();
-       open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!";
-       print_header("Index");
-       }
-else {
-       open(OUT,">kewensis.html") or die "Cannot open \"kewensis.html\": $!";
-       print_header("Full");
-       }
+$db=DBI->connect("DBI:$db_driver:database=$db_name;host=$db_host",$db_user,$db_pwd) or die "Database open fail: $DBI::errstr";        # DBI reports connect errors in $DBI::errstr, not $!
  
-sub format_href
+sub db_do       # Run one SQL statement on the file-level $db handle; die with the DBI error text on failure.
  {
-my($preinsert,$postinsert,%rec)=@_;
+my( $cmd )=@_;  # $cmd: complete SQL statement text
  
-       return htmlquote(rec_to_name(%rec));
+       $db->do($cmd) or die("SQL command \"$cmd\" failed: ".$db->errstr);      # DBI errors live in errstr, not $!
  }
  
-my($printrecref)=(defined $maxsize ? \&format_href : \&format_record );
-
-foreach $owner (sort keys %OWNS) {
-my( $child );
-
-       print OUT "<p>";
-       print OUT "<a href=\"".$DB{$owner}{"_filename"}."#$owner\">" if defined $maxsize;
-       print OUT &{$printrecref}("","",%{$DB{$owner}});
-       print OUT "</a>" if defined $maxsize;
-       print OUT "</p>";
-       if (@{$OWNS{$owner}}) {
-               print OUT "<ul>\n";
-               foreach $child (@{$OWNS{$owner}}) {
-                       print OUT "<li>";
-                       print OUT &{$printrecref}("","",%{$DB{$child}});
-                       print OUT "</li>\n";
-                       }
-               print OUT "</ul>";
+eval { &db_do("drop table $tb_tree") };
+
+&db_do("create table $tb_tree ("
+               ."id char(10) not null,"
+               ."family_id char(10) not null,"
+               ."family_order int not null,"
+               ."name varchar(100) not null,"
+               ."PublAuthor text null,"
+               ."Publication text null,"
+               ."html text null"
+               .")");
+
+&db_do("alter table $tb_tree add unique (id)");
+&db_do("alter table $tb_tree add index (name)");
+&db_do("alter table $tb_tree add unique (family_id,family_order)");
+
+my $insert_tb_tree=$db->prepare("insert into $tb_tree (id,family_id,family_order,name,PublAuthor,Publication,html) values (?,?,?,?,?,?,?)")
+               or die "Prepare fail: ".$db->errstr;    # DBI errors live in errstr, not $!
+
+foreach $owner (sort { $DB{$b}{"name"} cmp $DB{$a}{"name"}; } keys %OWNS) {    # one family per %OWNS key, in descending name order
+       my @family=@{$OWNS{$owner}};
+       unshift(@family,$owner);        # the owner itself is family member 0
+       my $family_id=$DB{$owner}{"id"};
+       for my $family_order (0..$#family) {
+               my $id=$family[$family_order];
+               print "insert:$id,".$DB{$id}{"name"}."\n" if $D>=2;
+               $insert_tb_tree->execute($id,$family_id,$family_order,
+                               $DB{$id}{"name"},$DB{$id}{"Publ. Author"},$DB{$id}{"Publication"},$DB{$id}{"html"}
+                               ) or die "SQL insert failure: ".$insert_tb_tree->errstr;
                 }
-       print OUT "\n";
         }
-OUT_flush();