Version developed during Xmas, not tested
[kewensis.git] / kewensis-collect.pl
index 009be1e..61b6d88 100755 (executable)
@@ -7,10 +7,10 @@ use Cwd;
 use Data::Dumper;
 use DBI;
 
-my(%DB,%OWNS,$debugmatch,$D,$key,$owner,$ref,$filename,$doimport,$import_xlate);
+my(%DB,$debugmatch,$D,$key,$owner,$ref,$filename,$doimport,$import_xlate);
 
 %DB=();
-$D=0;
+$D=1;
 $debugmatch=0;
 $doimport=1;
 $import_xlate=1;
@@ -64,7 +64,7 @@ my($file)=@_;
 </head>
 <body bgcolor="#ffffff" text="#000000" link="#006666" vlink="#008080" alink="#008080">
 <HR><b><i>Orchidaceae</i> ($word)</b> ($word) <br>
-((?:<a href="\./PublicationServlet\?id=($id)&query_type=by_id"> ($word)</a> ($word)|$bigword(?:<br>\n$word)*)?<p>($bigword)?</p><p>($word)?</p>(?:
+(?:<a href="\./PublicationServlet\?id=($id)&query_type=by_id"> ($word)</a> ($word)|($bigword(?:<br>\n$word)*))?(<p>($bigword)?</p><p>($word)?</p>(?:
 remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
 <h4>Linked Records</h2>
 ((?:$attrpat)*))?(<br>
@@ -83,11 +83,17 @@ remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
        my(%rec);
        $rec{"name"}=$1;
        $rec{"Publ. Author"}=$2;
-       $rec{"Publication"}="$5 $6" if defined($5) && defined($6);
-       $rec{"html"}=$3 if defined $3;
-       my($attrsbody)=$11;
+       my $publ="";
+       $publ.="$4 $5" if defined($4) && defined($5);
+       $publ.=$6 if defined $6;
+       $rec{"html"}=$7 if defined $7;
+       my($attrsbody)=$12;
+       # catch-array destroyed here!
+       $publ.=s/\<br\>//g;
+       $rec{"Publication"}=$publ if $publ ne "";
        ($rec{"id"}=$file)=~s#^($id)\.html$#$1#os or failed($file);
        $rec{"html"}=~s#$ipniservletwordthree#<a href="$1/$1$2.html">$3</a>#osg if $import_xlate && exists $rec{"html"};
+       $rec{"refs"}=[];
        my $score=0; # -: upper, +: lower
        while (defined($attrsbody) && $attrsbody=~s%^(?:$attrpat)%%os) {
                # nomenclatural synonym: id=$2
@@ -98,15 +104,15 @@ remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
                # later publication: id=$16
                # Is a later publication: id=$18
                my(@refs)=($2,$5,$8,$11,$13,$16,$18);
-               $rec{"refs"}=[];
                while (@refs) {
                        push(@{$rec{"refs"}},$_) if defined ($_=shift @refs);
                        }
-               $score+=-10 if defined  $5; # basionym: id=$5
-               $score+=+10 if defined $13; # Is a basionym: id=$13
-               $score+=+10 if defined $16; # later publication: id=$16
-               $score+=-10 if defined $18; # Is a later publication: id=$18
+               $score+=+10 if defined  $5; # basionym: id=$5
+               $score+=-10 if defined $13; # Is a basionym: id=$13
+               $score+=- 4 if defined $16; # later publication: id=$16
+               $score+=+ 4 if defined $18; # Is a later publication: id=$18
                }
+       $score+=+9 if $rec{"Publ. Author"}=~/^\s*\(/;
        $rec{"score"}=$score;
        if ($attrsbody) {
                failed($file);
@@ -141,18 +147,15 @@ my( $entry )=@_;
        process_file($entry) if -f $entry;
 }
 
-%OWNS=();
-
 foreach (@ARGV)
        { process_entry($_); }
 
-my($id);
-for $id (keys %DB) {
+for my $id (keys %DB) {
        my($refid);
        my(@refs);
        for $refid (@{$DB{$id}{"refs"}}) {
                if (!exists $DB{$refid}) {
-                       warn "Undefined ref id \"$refid\" from id \"$id\"" if $D;
+                       print "Undefined ref id \"$refid\" from id \"$id\"\n" if $D>=1;
                        next;
                        }
                next if $id eq $refid;  # self-ref
@@ -162,28 +165,43 @@ for $id (keys %DB) {
        $DB{$id}{"refs"}=\@refs;
        }
 
-print Data::Dumper->Dump([\%DB],["%DB"]) if $D;
+print Data::Dumper->Dump([\%DB],["%DB"]) if $D>=2;
+
+my %OWNS=map { $_=>$DB{$_}{"refs"}; } keys(%DB);
+my %NAMES=();
 
-%OWNS=map { $_=>[] } keys(%DB);
+for my $id (keys %DB) {
+       $NAMES{$DB{$id}{"name"}}=[] if !exists $NAMES{$DB{$id}{"name"}};
+       push @{$NAMES{$DB{$id}{"name"}}},$id;
+       }
 
-for $id (keys %OWNS) {
+for my $id (keys %DB) {
        next if !exists $OWNS{$id};
-       for my $refid (@{$DB{$id}{"refs"}}) {
+       my @queue=($id,@{$OWNS{$id}});
+       $OWNS{$id}=[];
+       while (my $refid=shift @queue) {
+               if (exists $NAMES{$DB{$refid}{"name"}}) {
+                       push(@queue,@{$NAMES{$DB{$refid}{"name"}}});
+                       delete $NAMES{$DB{$refid}{"name"}};
+                       }
+               next if $refid eq $id;
                next if !exists $OWNS{$refid};
-               push(@{$OWNS{$id}},$refid,@{$OWNS{$refid}});
+               push @queue,@{$OWNS{$refid}};
                delete $OWNS{$refid};
-               print "connected: id=$id,refid=$refid\n" if $D;
+               print "processed connect id=$id <- refid=$refid\n" if $D>=1;
+               push @{$OWNS{$id}},$refid;
                }
        }
 
-print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D;
+print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D>=2;
 
 foreach $key (keys %OWNS) {
        my(@keys)=@{$OWNS{$key}};
        delete($OWNS{$key});
        unshift(@keys,$key);
-       @keys=sort { $DB{$a}{"score"} <=> $DB{$b}{"score"}
-                       || extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys;
+#      @keys=sort { $DB{$a}{"score"} <=> $DB{$b}{"score"}
+#                      || extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys;
+       @keys=sort { $DB{$a}{"name"} cmp $DB{$b}{"name"}; } @keys;
        my($pkey)=shift(@keys);
        $OWNS{$pkey}=\@keys;
        }
@@ -194,7 +212,8 @@ my($tb_tree);
 $db_driver="mysql";
 $db_host="";
 $db_user="short";
-$DB_PWD=$ENV{"HOME"}."/priv/mysql.${db_user}.pwd";
+#$DB_PWD=$ENV{"HOME"}."/priv/mysql.${db_user}.pwd";
+$db_pwd="short";
 $db_name="short";
 $tb_tree="kewensis_tree";
 
@@ -216,7 +235,6 @@ eval { &db_do("drop table $tb_tree") };
                ."name varchar(100) not null,"
                ."PublAuthor text null,"
                ."Publication text null,"
-               ."Notes text null,"
                ."html text null"
                .")");
 
@@ -224,7 +242,7 @@ eval { &db_do("drop table $tb_tree") };
 &db_do("alter table $tb_tree add index (name)");
 &db_do("alter table $tb_tree add unique (family_id,family_order)");
 
-my $insert_tb_tree=$db->prepare("insert into $tb_tree (id,family_id,family_order,name,PublAuthor,Publication,Notes,html) values (?,?,?,?,?,?,?)")
+my $insert_tb_tree=$db->prepare("insert into $tb_tree (id,family_id,family_order,name,PublAuthor,Publication,html) values (?,?,?,?,?,?,?)")
                or die "Prepare fail: $!";
 
 foreach $owner (sort { $DB{$b}{"name"} cmp $DB{$a}{"name"}; } keys %OWNS) {
@@ -233,9 +251,9 @@ foreach $owner (sort { $DB{$b}{"name"} cmp $DB{$a}{"name"}; } keys %OWNS) {
        my $family_id=$DB{$owner}{"id"};
        for my $family_order (0..$#family) {
                my $id=$family[$family_order];
-               print "insert:$id,".$DB{$id}{"name"}."\n" if $D;
+               print "insert:$id,".$DB{$id}{"name"}."\n" if $D>=2;
                $insert_tb_tree->execute($id,$family_id,$family_order,
-                               $DB{$id}{"name"},$DB{$id}{"Publ. Author"},$DB{$id}{"Publication"},$DB{$id}{"Notes"},$DB{$id}{"html"}
+                               $DB{$id}{"name"},$DB{$id}{"Publ. Author"},$DB{$id}{"Publication"},$DB{$id}{"html"}
                                ) or die "SQL insert failure: $!";
                }
        }