-#! /usr/bin/perl -w
+#! /usr/bin/perl
-use vars qw/%DB %OWNS %PARENT %CURR $last_field $debugmatch $debugparse $key $owner $parent $ref/;
+use strict;
+use warnings;
+
+use Cwd;
+use Data::Dumper;
+use DBI;
+
+my(%DB,$debugmatch,$D,$key,$owner,$ref,$filename,$doimport,$import_xlate);
%DB=();
-%CURR=();
-$debugparse=0;
-$printdupl=0;
+$D=1;
$debugmatch=0;
+$doimport=1;
+$import_xlate=1;
-sub name_to_key
+sub extract_year
{
-my( $r )=@_;
-
- $r=~tr/A-Z/a-z/;
- $r=~tr/a-z0-9//cd;
- return $r;
+ ($_)=@_;
+ if (defined($_)) {
+ s/\(([0-9]{4}) publ. [0-9]{4}\)/($1)/g;
+ while (/\(([0-9]{4})(-[0-9]+)?\)/) {
+ return $1 if ($1>=1700 && $1<=2010);
+ $_=$';
+ }
+ }
+ return -1;
}
-sub rec_to_name
+sub failed
{
-my( %rec )=@_;
-my( @list )=("Rank","Infrafam.","Genus","Species");
-my( $item,$r );
-
- $r="";
- while ($item=shift @list)
- { $r.=" ".$rec{$item} if exists $rec{$item}; }
- $r=~s/^ //;
- return $r;
+my($file)=@_;
+
+ print("-$file----------------$'--------------\n");
+ warn "Unable to match file \"".getcwd()."/$file\".";
}
-sub flush_CURR
+sub process_file
{
-my( $key );
-
- $key=&name_to_key(&rec_to_name(%CURR));
- return if (!defined $key);
- if (exists $DB{$key})
- { print STDERR "Key \"$key\" is duplicated!\n" if $printdupl; }
- else {
- print STDERR "ADDKEY: $key\n" if $debugparse;
- %{$DB{$key}}=%CURR;
+my($file)=@_;
+
+ if (!open(FI,$file)) {
+ warn("Unable to open file \"$file\" in dir \"".getcwd()."\": $!");
+ return;
}
- %CURR=();
-}
+ undef $/;
+ my($fi)=<FI>;
+ close(FI);
+ my($word)='[^<]*';
+ my($bigword)='[^<]*(?:<IT>)?[^<]*(?:<RO>)?[^<]*(?:<1000 m)?[^<]*';
+ my($idone)='[-\d]';
+ my($id)="$idone+";
+ my($any)='[\x00-\xFF]*';
+ my($ipniservletword)="<a href=\"\\./IpniServlet\\?id=($id)&query_type=by_id\">($word)</a>";
+ my($idquoted)=$id; $idquoted=~s%\W%\\$&%g;
+ my($ipniservletwordthree)=$ipniservletword; $ipniservletwordthree=~s%\($idquoted\)%((?:$idone){3})$&%os or die;
+ my($attrpat)="<p>nomenclatural synonym(\\(Main Record\\))?:$ipniservletword</p>|<p>basionym(\\(\\d+\\))?:$ipniservletword</p>|<p>basionym:($word)|<p>replaced synonym:$ipniservletword</p>|<p>replaced synonym:($word)|<p>Is a replaced synonym of:$ipniservletword</p>|<p>Is a basionym of:$ipniservletword</p>|<p>later publication of(\\(\\d+\\))?:$ipniservletword</p>|<p>Is a later publication of of:$ipniservletword</p>";
+ $|=1;
+ "<undef>"=~/^/;
+ if ($fi!~m%^<html>
+<head>
+<title>IPNI Query Results</title>
+</head>
+<body bgcolor="#ffffff" text="#000000" link="#006666" vlink="#008080" alink="#008080">
+<HR><b><i>Orchidaceae</i> ($word)</b> ($word) <br>
+(?:<a href="\./PublicationServlet\?id=($id)&query_type=by_id"> ($word)</a> ($word)|($bigword(?:<br>\n$word)*))?(<p>($bigword)?</p><p>($word)?</p>(?:
+remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
+<h4>Linked Records</h2>
+((?:$attrpat)*))?(<br>
-while (<>) {
- tr/\r\n//d;
- if (/^ *\304\301\304/) {
- &flush_CURR();
- print STDERR "---\n" if $debugparse;
+<h4>Original Data</h2>
+(?:basionym: ($word)<br>
+)?(?:hybrid parentage: ($word)<br>
+)?(?:replaced synonym: ($word)<br>
+)?(?:distribution: ($word)<br>
+)?(?:Notes: ($word))?)?)(?:<p><a href="\./query_ipni.html">Back to Search Page</a></p>
+</body>
+</html>)$%os) {
+ failed($file);
+ return;
}
- elsif (/^([^³]*[^ ]) *³ *(.*)$/) {
- print STDERR "FIELD: $1, VALUE: $2\n" if $debugparse;
- $last_field=$1;
- $CURR{$1}=$2;
+ my(%rec);
+ $rec{"name"}=$1;
+ $rec{"Publ. Author"}=$2;
+ my $publ="";
+ $publ.="$4 $5" if defined($4) && defined($5);
+ $publ.=$6 if defined $6;
+ $rec{"html"}=$7 if defined $7;
+ my($attrsbody)=$12;
+ # catch-array destroyed here!
+ $publ.=s/\<br\>//g;
+ $rec{"Publication"}=$publ if $publ ne "";
+ ($rec{"id"}=$file)=~s#^($id)\.html$#$1#os or failed($file);
+ $rec{"html"}=~s#$ipniservletwordthree#<a href="$1/$1$2.html">$3</a>#osg if $import_xlate && exists $rec{"html"};
+ $rec{"refs"}=[];
+ my $score=0; # -: upper, +: lower
+ while (defined($attrsbody) && $attrsbody=~s%^(?:$attrpat)%%os) {
+ # nomenclatural synonym: id=$2
+ # basionym: id=$5
+ # replaced synonym: id=$8
+ # Is a replaced synonym: id=$11
+ # Is a basionym: id=$13
+ # later publication: id=$16
+ # Is a later publication: id=$18
+ my(@refs)=($2,$5,$8,$11,$13,$16,$18);
+ while (@refs) {
+ push(@{$rec{"refs"}},$_) if defined ($_=shift @refs);
+ }
+ $score+=+10 if defined $5; # basionym: id=$5
+ $score+=-10 if defined $13; # Is a basionym: id=$13
+ $score+=- 4 if defined $16; # later publication: id=$16
+ $score+=+ 4 if defined $18; # Is a later publication: id=$18
}
- elsif (/^ *³ *(.*)$/) {
- print STDERR "APPEND: $1\n" if $debugparse;
- $CURR{$last_field}.=" ".$1;
+ $score+=+9 if $rec{"Publ. Author"}=~/^\s*\(/;
+ $rec{"score"}=$score;
+ if ($attrsbody) {
+ failed($file);
+ return;
+ }
+ $DB{$rec{"id"}}=\%rec;
+}
+
+
+sub process_dir
+{
+my( $dir )=@_;
+my( $old )=getcwd();
+
+ if (!chdir($dir)) {
+ warn("Unable to change to dir \"$dir\" from dir \"$old\": $!");
+ return;
}
- else {
- print STDERR "DISCARD: $_\n" if $debugparse;
+ opendir(DIR,".") or die("Cannot open . in \"".getcwd()."\": $!");
+ foreach (sort readdir(DIR)) {
+ process_entry($_) if ($_!~/^\./);
}
+ closedir(DIR);
+ chdir($old) or warn("Unable to retreat to dir \"$old\": $!");
+}
+
+sub process_entry
+{
+my( $entry )=@_;
+
+ process_dir ($entry) if -d $entry;
+ process_file($entry) if -f $entry;
+}
+
+foreach (@ARGV)
+ { process_entry($_); }
+
+for my $id (keys %DB) {
+ my($refid);
+ my(@refs);
+ for $refid (@{$DB{$id}{"refs"}}) {
+ if (!exists $DB{$refid}) {
+ print "Undefined ref id \"$refid\" from id \"$id\"\n" if $D>=1;
+ next;
+ }
+ next if $id eq $refid; # self-ref
+# push(@refs,$DB{$refid});
+ push(@refs,$refid);
+ }
+ $DB{$id}{"refs"}=\@refs;
}
-&flush_CURR();
-undef %CURR;
-%OWNS=();
-%PARENT=();
+print Data::Dumper->Dump([\%DB],["%DB"]) if $D>=2;
-sub try_reparent
-{
-my( $reparent,$refkey );
+my %OWNS=map { $_=>$DB{$_}{"refs"}; } keys(%DB);
+my %NAMES=();
- $refkey=&name_to_key($ref);
- return 0 if ($refkey eq "");
- return 0 if ($parent eq $refkey);
- return 0 if !exists $DB{$refkey};
- print STDERR "try_reparent: SUCCESS: key=\"$key\", refkey=\"$refkey\"\n" if $debugmatch;
+for my $id (keys %DB) {
+ $NAMES{$DB{$id}{"name"}}=[] if !exists $NAMES{$DB{$id}{"name"}};
+ push @{$NAMES{$DB{$id}{"name"}}},$id;
+ }
- foreach $reparent (@{$OWNS{$parent}}) {
- $PARENT{$reparent}=$refkey;
+for my $id (keys %DB) {
+ next if !exists $OWNS{$id};
+ my @queue=($id,@{$OWNS{$id}});
+ $OWNS{$id}=[];
+ while (my $refid=shift @queue) {
+ if (exists $NAMES{$DB{$refid}{"name"}}) {
+ push(@queue,@{$NAMES{$DB{$refid}{"name"}}});
+ delete $NAMES{$DB{$refid}{"name"}};
+ }
+ next if $refid eq $id;
+ next if !exists $OWNS{$refid};
+ push @queue,@{$OWNS{$refid}};
+ delete $OWNS{$refid};
+ print "processed connect id=$id <- refid=$refid\n" if $D>=1;
+ push @{$OWNS{$id}},$refid;
}
- @{$OWNS{$parent}}=() if (!defined $OWNS{$parent});
- @{$OWNS{$refkey}}=() if (!defined $OWNS{$refkey});
- @{$OWNS{$refkey}}=(@{$OWNS{$refkey}},$parent,@{$OWNS{$parent}});
- delete $OWNS{$parent};
+ }
- return 1;
-}
+print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D>=2;
-MATCH: foreach $key (keys %DB) {
- $parent=$PARENT{$key};
- $parent=$key if (!defined $parent);
- $ref=$DB{$key}{"Notes"};
- next MATCH if (!defined $ref);
- $ref=~tr/()//d;
- $ref=~s/^=//;
- $ref=~s/^O\. *//i;
- $ref=~s/^Orchidaceae *//i;
- $ref=~s/\..*$//;
- next MATCH if &try_reparent();
- $ref=$DB{$key}{"Rank"}." $ref" if exists $DB{$key}{"Rank"};
- next MATCH if &try_reparent();
- $ref=$DB{$key}{"Genus"}." $ref" if exists $DB{$key}{"Genus"};
- next MATCH if &try_reparent();
+foreach $key (keys %OWNS) {
+ my(@keys)=@{$OWNS{$key}};
+ delete($OWNS{$key});
+ unshift(@keys,$key);
+# @keys=sort { $DB{$a}{"score"} <=> $DB{$b}{"score"}
+# || extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys;
+ @keys=sort { $DB{$a}{"name"} cmp $DB{$b}{"name"}; } @keys;
+ my($pkey)=shift(@keys);
+ $OWNS{$pkey}=\@keys;
}
-undef %PARENT;
-foreach $owner (keys %OWNS) {
-my( $child );
+my($db_driver,$db_host,$db_user,$db_pwd,$DB_PWD,$db_name,$db);
+my($tb_tree);
+
+$db_driver="mysql";
+$db_host="";
+$db_user="short";
+#$DB_PWD=$ENV{"HOME"}."/priv/mysql.${db_user}.pwd";
+$db_pwd="short";
+$db_name="short";
+$tb_tree="kewensis_tree";
+
+$db=DBI->connect("DBI:$db_driver:database=$db_name;host=$db_host",$db_user,$db_pwd) or die "Database open fail: $!";
+
+sub db_do
+{
+my( $cmd )=@_;
+
+ $db->do($cmd) or die("SQL command \"$cmd\" failed: $!");
+}
+
+eval { &db_do("drop table $tb_tree") };
+
+&db_do("create table $tb_tree ("
+ ."id char(10) not null,"
+ ."family_id char(10) not null,"
+ ."family_order int not null,"
+ ."name varchar(100) not null,"
+ ."PublAuthor text null,"
+ ."Publication text null,"
+ ."html text null"
+ .")");
+
+&db_do("alter table $tb_tree add unique (id)");
+&db_do("alter table $tb_tree add index (name)");
+&db_do("alter table $tb_tree add unique (family_id,family_order)");
+
+my $insert_tb_tree=$db->prepare("insert into $tb_tree (id,family_id,family_order,name,PublAuthor,Publication,html) values (?,?,?,?,?,?,?)")
+ or die "Prepare fail: $!";
- if ($debugmatch) {
- print STDERR "($owner):";
- foreach $child (@{$OWNS{$owner}})
- { print STDERR " ($child)"; }
- print STDERR "\n";
+foreach $owner (sort { $DB{$b}{"name"} cmp $DB{$a}{"name"}; } keys %OWNS) {
+ my @family=@{$OWNS{$owner}};
+ unshift(@family,$owner);
+ my $family_id=$DB{$owner}{"id"};
+ for my $family_order (0..$#family) {
+ my $id=$family[$family_order];
+ print "insert:$id,".$DB{$id}{"name"}."\n" if $D>=2;
+ $insert_tb_tree->execute($id,$family_id,$family_order,
+ $DB{$id}{"name"},$DB{$id}{"Publ. Author"},$DB{$id}{"Publication"},$DB{$id}{"html"}
+ ) or die "SQL insert failure: $!";
}
- print "(".&rec_to_name(%{$DB{$owner}}).")";
- foreach $child (@{$OWNS{$owner}})
- { print " (".&rec_to_name(%{$DB{$child}}).")"; }
- print "\n";
}