X-Git-Url: https://git.jankratochvil.net/?p=kewensis.git;a=blobdiff_plain;f=kewensis-collect.pl;h=61b6d884c708cb376a6844f5feca63ae2bea241b;hp=619015323b18c65fac5e7132eb04e3fec8c48ea9;hb=5576eb19ac5c37ed1629abcb68db6b5b528a259b;hpb=71cc8d6abe1ba07d13321c5a9491f0966babfe86 diff --git a/kewensis-collect.pl b/kewensis-collect.pl index 6190153..61b6d88 100755 --- a/kewensis-collect.pl +++ b/kewensis-collect.pl @@ -3,232 +3,257 @@ use strict; use warnings; -use vars qw/%DB %OWNS %PARENT %CURR $last_field $debugmatch $debugparse $key $owner $parent $ref $note_rest $printdupl $maxsize $fileno $fileid $filename/; +use Cwd; +use Data::Dumper; +use DBI; + +my(%DB,$debugmatch,$D,$key,$owner,$ref,$filename,$doimport,$import_xlate); -$maxsize=0x20000; %DB=(); -%CURR=(); -$debugparse=0; -$printdupl=0; +$D=1; $debugmatch=0; +$doimport=1; +$import_xlate=1; -sub name_to_key +sub extract_year { -my( $r )=@_; - - $r=~tr/A-Z/a-z/; - $r=~tr/a-z0-9//cd; - return $r; + ($_)=@_; + if (defined($_)) { + s/\(([0-9]{4}) publ. [0-9]{4}\)/($1)/g; + while (/\(([0-9]{4})(-[0-9]+)?\)/) { + return $1 if ($1>=1700 && $1<=2010); + $_=$'; + } + } + return -1; } -sub rec_to_name +sub failed { -my( %rec )=@_; -my( @list )=("Genus","Species"); -my( $item,$r ); - - $r=""; - while ($item=shift @list) - { $r.=" ".$rec{$item} if exists $rec{$item}; } - $r=~s/^ //; - return $r; -} +my($file)=@_; -sub htmlquote -{ -($_)=@_; - - s/&/&/g; - s//>/g; - s/\n/&nl;/g; - s/"/"/g; - return $_; + print("-$file----------------$'--------------\n"); + warn "Unable to match file \"".getcwd()."/$file\"."; } -sub print_rec +sub process_file { -my( $preinsert,$postinsert,%rec )=@_; - - return if (!%rec); - print OUT "\n"; - print OUT "\n"; - delete $rec{"Genus"}; - delete $rec{"Species"}; - foreach $key (sort keys %rec) - { print OUT "\n"; } - print OUT "
".$preinsert.&rec_to_name(%rec).$postinsert."
".&htmlquote($key).":".&htmlquote($rec{$key})."
\n"; -} +my($file)=@_; -sub flush_CURR -{ -my( $key ); - - $key=&name_to_key(&rec_to_name(%CURR)); - return if (!$key); - delete $CURR{"Family"}; - if (exists $DB{$key}) - { print STDERR "Key \"$key\" is duplicated!\n" if $printdupl; } - else { - print STDERR "ADDKEY: $key\n" if $debugparse; - %{$DB{$key}}=%CURR; + if (!open(FI,$file)) { + warn("Unable to open file \"$file\" in dir \"".getcwd()."\": $!"); + return; } - %CURR=(); - @{$OWNS{$key}}=(); -} - -%OWNS=(); - -while (<>) { - tr/\r\n//d; - if (/^ *\304\301\304/) { - &flush_CURR(); - print STDERR "---\n" if $debugparse; - } - elsif (/^([^³]*[^ ]) *³ *(.*)$/) { - print STDERR "FIELD: $1, VALUE: $2\n" if $debugparse; - $last_field=$1; - $CURR{$1}=$2; + undef $/; + my($fi)=; + close(FI); + my($word)='[^<]*'; + my($bigword)='[^<]*(?:)?[^<]*(?:)?[^<]*(?:<1000 m)?[^<]*'; + my($idone)='[-\d]'; + my($id)="$idone+"; + my($any)='[\x00-\xFF]*'; + my($ipniservletword)="($word)"; + my($idquoted)=$id; $idquoted=~s%\W%\\$&%g; + my($ipniservletwordthree)=$ipniservletword; $ipniservletwordthree=~s%\($idquoted\)%((?:$idone){3})$&%os or die; + my($attrpat)="

nomenclatural synonym(\\(Main Record\\))?:$ipniservletword

|

basionym(\\(\\d+\\))?:$ipniservletword

|

basionym:($word)|

replaced synonym:$ipniservletword

|

replaced synonym:($word)|

Is a replaced synonym of:$ipniservletword

|

Is a basionym of:$ipniservletword

|

later publication of(\\(\\d+\\))?:$ipniservletword

|

Is a later publication of of:$ipniservletword

"; + $|=1; + ""=~/^/; + if ($fi!~m%^ + +IPNI Query Results + + +
Orchidaceae ($word) ($word)
+(?: ($word) ($word)|($bigword(?:
\n$word)*))?(

($bigword)?

($word)?

(?: +remarks: .*)?(

Type

)?()?(?: +

Linked Records

+((?:$attrpat)*))?(
+ +

Original Data

+(?:basionym: ($word)
+)?(?:hybrid parentage: ($word)
+)?(?:replaced synonym: ($word)
+)?(?:distribution: ($word)
+)?(?:Notes: ($word))?)?)(?:

Back to Search Page

+ +)$%os) { + failed($file); + return; } - elsif (/^ *³ *(.*)$/) { - print STDERR "APPEND: $1\n" if $debugparse; - $CURR{$last_field}.=" ".$1; + my(%rec); + $rec{"name"}=$1; + $rec{"Publ. Author"}=$2; + my $publ=""; + $publ.="$4 $5" if defined($4) && defined($5); + $publ.=$6 if defined $6; + $rec{"html"}=$7 if defined $7; + my($attrsbody)=$12; + # catch-array destroyed here! + $publ.=s/\//g; + $rec{"Publication"}=$publ if $publ ne ""; + ($rec{"id"}=$file)=~s#^($id)\.html$#$1#os or failed($file); + $rec{"html"}=~s#$ipniservletwordthree#$3#osg if $import_xlate && exists $rec{"html"}; + $rec{"refs"}=[]; + my $score=0; # -: upper, +: lower + while (defined($attrsbody) && $attrsbody=~s%^(?:$attrpat)%%os) { + # nomenclatural synonym: id=$2 + # basionym: id=$5 + # replaced synonym: id=$8 + # Is a replaced synonym: id=$11 + # Is a basionym: id=$13 + # later publication: id=$16 + # Is a later publication: id=$18 + my(@refs)=($2,$5,$8,$11,$13,$16,$18); + while (@refs) { + push(@{$rec{"refs"}},$_) if defined ($_=shift @refs); + } + $score+=+10 if defined $5; # basionym: id=$5 + $score+=-10 if defined $13; # Is a basionym: id=$13 + $score+=- 4 if defined $16; # later publication: id=$16 + $score+=+ 4 if defined $18; # Is a later publication: id=$18 } - else { - print STDERR "DISCARD: $_\n" if $debugparse; + $score+=+9 if $rec{"Publ. Author"}=~/^\s*\(/; + $rec{"score"}=$score; + if ($attrsbody) { + failed($file); + return; } - } -&flush_CURR(); -undef %CURR; + $DB{$rec{"id"}}=\%rec; +} -%PARENT=(); -sub try_reparent +sub process_dir { -my( $reparent,$refkey ); - - $refkey=&name_to_key($ref); - return 0 if ($refkey eq ""); - return 0 if ($parent eq $refkey); - return 0 if !exists $DB{$refkey}; - print STDERR "try_reparent: SUCCESS: key=\"$key\", refkey=\"$refkey\"\n" if $debugmatch; - if ($note_rest) - { $DB{$key}{"Notes"}=$note_rest; } - else - { delete $DB{$key}{"Notes"}; } - - foreach $reparent (@{$OWNS{$parent}}) { - $PARENT{$reparent}=$refkey; - } - @{$OWNS{$parent}}=() if (!defined $OWNS{$parent}); - @{$OWNS{$refkey}}=() if (!defined $OWNS{$refkey}); - @{$OWNS{$refkey}}=(@{$OWNS{$refkey}},$parent,@{$OWNS{$parent}}); - delete $OWNS{$parent}; +my( $dir )=@_; +my( $old )=getcwd(); - return 1; + if (!chdir($dir)) { + warn("Unable to change to dir \"$dir\" from dir \"$old\": $!"); + return; + } + opendir(DIR,".") or die("Cannot open . in \"".getcwd()."\": $!"); + foreach (sort readdir(DIR)) { + process_entry($_) if ($_!~/^\./); + } + closedir(DIR); + chdir($old) or warn("Unable to retreat to dir \"$old\": $!"); } -MATCH: foreach $key (keys %DB) { - $parent=$PARENT{$key}; - $parent=$key if (!defined $parent); - $ref=$DB{$key}{"Notes"}; - next MATCH if (!defined $ref); - $ref=~tr/()//d; - $ref=~s/^=//; - $ref=~s/^O\. *//i; - $ref=~s/^Orchidaceae *//i; - $ref=~s/\. *(.*)$//; - $note_rest=$1; - next MATCH if &try_reparent(); - $ref=$DB{$key}{"Rank"}." $ref" if exists $DB{$key}{"Rank"}; - next MATCH if &try_reparent(); - $ref=$DB{$key}{"Genus"}." $ref" if exists $DB{$key}{"Genus"}; - next MATCH if &try_reparent(); - } -undef %PARENT; - -sub print_header +sub process_entry { -my($header)=@_; - - print OUT -" - - -Kewensis $header - - -\n"; -} - -$fileno=-1; +my( $entry )=@_; -sub OUT_flush -{ - print OUT "\n"; - close(OUT); + process_dir ($entry) if -d $entry; + process_file($entry) if -f $entry; } -my( $filename ); -foreach $owner (sort keys %OWNS) { -my( $child ); +foreach (@ARGV) + { process_entry($_); } - if ($fileno<0 || tell(OUT)>=$maxsize) { - &OUT_flush() if ($fileno>=0); - $fileid=sprintf("%04d",++$fileno); - $filename="kew-$fileid.html"; - open(OUT,">$filename") or die "Cannot open \"$filename\": $!"; - &print_header("chunk $fileid"); +for my $id (keys %DB) { + my($refid); + my(@refs); + for $refid (@{$DB{$id}{"refs"}}) { + if (!exists $DB{$refid}) { + print "Undefined ref id \"$refid\" from id \"$id\"\n" if $D>=1; + next; + } + next if $id eq $refid; # self-ref +# push(@refs,$DB{$refid}); + push(@refs,$refid); } + $DB{$id}{"refs"}=\@refs; + } - if ($debugmatch) { - print STDERR "($owner):"; - foreach $child (@{$OWNS{$owner}}) - { print STDERR " ($child)"; } - print STDERR "\n"; - } - &print_rec("","",%{$DB{$owner}}); - if (@{$OWNS{$owner}}) { - print OUT "
\n"; - foreach $child (reverse @{$OWNS{$owner}}) - { &print_rec("","",%{$DB{$child}}); } - print OUT "
\n"; +print Data::Dumper->Dump([\%DB],["%DB"]) if $D>=2; + +my %OWNS=map { $_=>$DB{$_}{"refs"}; } keys(%DB); +my %NAMES=(); + +for my $id (keys %DB) { + $NAMES{$DB{$id}{"name"}}=[] if !exists $NAMES{$DB{$id}{"name"}}; + push @{$NAMES{$DB{$id}{"name"}}},$id; + } + +for my $id (keys %DB) { + next if !exists $OWNS{$id}; + my @queue=($id,@{$OWNS{$id}}); + $OWNS{$id}=[]; + while (my $refid=shift @queue) { + if (exists $NAMES{$DB{$refid}{"name"}}) { + push(@queue,@{$NAMES{$DB{$refid}{"name"}}}); + delete $NAMES{$DB{$refid}{"name"}}; + } + next if $refid eq $id; + next if !exists $OWNS{$refid}; + push @queue,@{$OWNS{$refid}}; + delete $OWNS{$refid}; + print "processed connect id=$id <- refid=$refid\n" if $D>=1; + push @{$OWNS{$id}},$refid; } - print OUT "\n"; - $DB{$owner}{"_filename"}=$filename; } -&OUT_flush(); -open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!"; -&print_header("Index"); -sub print_href +print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D>=2; + +foreach $key (keys %OWNS) { + my(@keys)=@{$OWNS{$key}}; + delete($OWNS{$key}); + unshift(@keys,$key); +# @keys=sort { $DB{$a}{"score"} <=> $DB{$b}{"score"} +# || extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys; + @keys=sort { $DB{$a}{"name"} cmp $DB{$b}{"name"}; } @keys; + my($pkey)=shift(@keys); + $OWNS{$pkey}=\@keys; + } + +my($db_driver,$db_host,$db_user,$db_pwd,$DB_PWD,$db_name,$db); +my($tb_tree); + +$db_driver="mysql"; +$db_host=""; +$db_user="short"; +#$DB_PWD=$ENV{"HOME"}."/priv/mysql.${db_user}.pwd"; +$db_pwd="short"; +$db_name="short"; +$tb_tree="kewensis_tree"; + +$db=DBI->connect("DBI:$db_driver:database=$db_name;host=$db_host",$db_user,$db_pwd) or die "Database open fail: $!"; + +sub db_do { -my($owner)=@_; -my(%rec)=%{$DB{$owner}}; +my( $cmd )=@_; - print OUT &htmlquote(&rec_to_name(%rec)); + $db->do($cmd) or die("SQL command \"$cmd\" failed: $!"); } -foreach $owner (sort keys %OWNS) { -my( $child ); - - print OUT "

"; - &print_href($owner); - print OUT "

"; - if (@{$OWNS{$owner}}) { - print OUT "
    \n"; - foreach $child (reverse @{$OWNS{$owner}}) { - print OUT "
  • "; - &print_href($child); - print OUT "
  • \n"; - } - print OUT "
"; +eval { &db_do("drop table $tb_tree") }; + +&db_do("create table $tb_tree (" + ."id char(10) not null," + ."family_id char(10) not null," + ."family_order int not null," + ."name varchar(100) not null," + ."PublAuthor text null," + ."Publication text null," + ."html text null" + .")"); + +&db_do("alter table $tb_tree add unique (id)"); +&db_do("alter table $tb_tree add index (name)"); +&db_do("alter table $tb_tree add unique (family_id,family_order)"); + +my $insert_tb_tree=$db->prepare("insert into $tb_tree (id,family_id,family_order,name,PublAuthor,Publication,html) values (?,?,?,?,?,?,?)") + or die "Prepare fail: $!"; + +foreach $owner (sort { $DB{$b}{"name"} cmp $DB{$a}{"name"}; } keys %OWNS) { + my @family=@{$OWNS{$owner}}; + unshift(@family,$owner); + my $family_id=$DB{$owner}{"id"}; + for my $family_order (0..$#family) { + my $id=$family[$family_order]; + print "insert:$id,".$DB{$id}{"name"}."\n" if $D>=2; + $insert_tb_tree->execute($id,$family_id,$family_order, + $DB{$id}{"name"},$DB{$id}{"Publ. Author"},$DB{$id}{"Publication"},$DB{$id}{"html"} + ) or die "SQL insert failure: $!"; } - print OUT "\n"; } -OUT_flush();