From c34552151797af2bf9fdd40b52026bfbc027ccab Mon Sep 17 00:00:00 2001 From: short <> Date: Sat, 8 Dec 2001 04:31:49 +0000 Subject: [PATCH] Parts rewritten to parse ipni.org data --- kewensis-collect.pl | 337 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 203 insertions(+), 134 deletions(-) diff --git a/kewensis-collect.pl b/kewensis-collect.pl index c6f399b..aba360d 100755 --- a/kewensis-collect.pl +++ b/kewensis-collect.pl @@ -3,14 +3,17 @@ use strict; use warnings; -use vars qw/%DB %OWNS %PARENT %CURR $last_field $debugmatch $debugparse $key $owner $parent $ref $note_rest $printdupl $maxsize $fileno $fileid $filename/; +use Cwd; +use Data::Dumper; -$maxsize=0x40000; +my(%DB,%OWNS,$debugmatch,$D,$key,$owner,$ref,$maxsize,$filename,$doimport,$import_xlate); + +# $maxsize=0x40000; %DB=(); -%CURR=(); -$debugparse=0; -$printdupl=0; +$D=0; $debugmatch=0; +$doimport=1; +$import_xlate=1; sub name_to_key { @@ -24,25 +27,21 @@ my( $r )=@_; sub rec_to_name { my( %rec )=@_; -my( @list )=("Genus","Species"); -my( $item,$r ); - $r=""; - while ($item=shift @list) - { $r.=" ".$rec{$item} if exists $rec{$item}; } - $r=~s/^ //; - return $r; + return $rec{"name"}; } sub htmlquote { -($_)=@_; +my($class); +($_,$class)=@_; s/&/&/g; s//>/g; s/\n/&nl;/g; s/"/"/g; + return "$_" if defined $class; return $_; } @@ -50,12 +49,15 @@ sub format_record { my( $preinsert,$postinsert,%rec )=@_; - my($r)="

".$preinsert.&htmlquote(&rec_to_name(%rec)).$postinsert.""; - $r.="\n".htmlquote($rec{"Publ. Author"}) if (exists($rec{"Publ. Author"})); - $r.="
\n".htmlquote($rec{"Publication"}) if (exists($rec{"Publication"})); - $r.="
\n".htmlquote($rec{"Notes"}) if (exists($rec{"Notes"})); - $r.="

\n"; - return($r); + my($r)=$preinsert.htmlquote(rec_to_name(%rec),"name").$postinsert; + $r.="\n".htmlquote($rec{"Publ. Author"},"author") if (exists($rec{"Publ. Author"})); + $r.="
\n".htmlquote($rec{"Publication"},"publication") if (exists($rec{"Publication"})); + $r.="
\n".htmlquote($rec{"Notes"},"notes") if (exists($rec{"Notes"})); + if ($doimport && exists($rec{"html"})) { + my($import)="import: [".$rec{"id"}."]"; + $r.="
\n
\n".$rec{"html"}."\n
\n"; + } + return $r; } sub extract_year @@ -68,100 +70,154 @@ sub extract_year $_=$'; } } - return(-1); + return -1; } -sub flush_CURR +sub failed { -my( $key ); - - $key=&name_to_key(&rec_to_name(%CURR)); - return if (!$key); - delete $CURR{"Family"}; - if (exists $DB{$key}) - { print STDERR "Key \"$key\" is duplicated!\n" if $printdupl; } - else { - print STDERR "ADDKEY: $key\n" if $debugparse; - %{$DB{$key}}=%CURR; -# my($year)=&extract_year($CURR{"Notes"}); -# %{$DB{$key}}{"year"}=$year if (defined($year)); - } - %CURR=(); - @{$OWNS{$key}}=(); +my($file)=@_; + + print("-$file----------------$'--------------\n"); + warn "Unable to match file \"".getcwd()."/$file\"."; } -%OWNS=(); +sub process_file +{ +my($file)=@_; -while (<>) { - tr/\r\n//d; - if (/^ *\304\301\304/) { - &flush_CURR(); - print STDERR "---\n" if $debugparse; + if (!open(FI,$file)) { + warn("Unable to open file \"$file\" in dir \"".getcwd()."\": $!"); + return; } - elsif (/^([^³]*[^ ]) *³ *(.*)$/) { - print STDERR "FIELD: $1, VALUE: $2\n" if $debugparse; - $last_field=$1; - $CURR{$1}=$2; + undef $/; + my($fi)=; + close(FI); + my($word)='[^<]*'; + my($bigword)='[^<]*(?:)?[^<]*(?:)?[^<]*(?:<1000 m)?[^<]*'; + my($idone)='[-\d]'; + my($id)="$idone+"; + my($any)='[\x00-\xFF]*'; + my($ipniservletword)="($word)"; + my($idquoted)=$id; $idquoted=~s%\W%\\$&%g; + my($ipniservletwordthree)=$ipniservletword; $ipniservletwordthree=~s%\($idquoted\)%((?:$idone){3})$&%os or die; + my($attrpat)="

nomenclatural synonym(\\(Main Record\\))?:$ipniservletword

|

basionym(\\(\\d+\\))?:$ipniservletword

|

basionym:($word)|

replaced synonym:$ipniservletword

|

replaced synonym:($word)|

Is a replaced synonym of:$ipniservletword

|

Is a basionym of:$ipniservletword

|

later publication of(\\(\\d+\\))?:$ipniservletword

|

Is a later publication of of:$ipniservletword

"; + $|=1; + ""=~/^/; + if ($fi!~m%^ + +IPNI Query Results + + +
Orchidaceae ($word) ($word)
+((?: ($word) ($word)|$bigword(?:
\n$word)*)?

($bigword)?

($word)?

(?: +remarks: .*)?(

Type

)?()?(?: +

Linked Records

+((?:$attrpat)*))?(
+ +

Original Data

+(?:basionym: ($word)
+)?(?:hybrid parentage: ($word)
+)?(?:replaced synonym: ($word)
+)?(?:distribution: ($word)
+)?(?:Notes: ($word))?)?)(?:

Back to Search Page

+ +)$%os) { + failed($file); + return; } - elsif (/^ *³ *(.*)$/) { - print STDERR "APPEND: $1\n" if $debugparse; - $CURR{$last_field}.=" ".$1; + my(%rec); + $rec{"name"}=$1; + $rec{"Publ. Author"}=$2; + $rec{"Publication"}="$5 $6" if defined($5) && defined($6); + $rec{"html"}=$3 if defined $3; + my($attrsbody)=$11; + ($rec{"id"}=$file)=~s#^($id)\.html$#$1#os or failed($file); + $rec{"html"}=~s#$ipniservletwordthree#$3#osg if $import_xlate && exists $rec{"html"}; + while (defined($attrsbody) && $attrsbody=~s%^(?:$attrpat)%%os) { + # nomenclatural synonym: id=$2 + # basionym: id=$5 + # replaced synonym: id=$8 + # Is a replaced synonym: id=$11 + # Is a basionym: id=$13 + # later publication: id=$16 + # Is a later publication: id=$18 + my(@refs)=($2,$5,$8,$11,$13,$16,$18); + $rec{"refs"}=[]; + while (@refs) { + push(@{$rec{"refs"}},$_) if defined ($_=shift @refs); + } } - else { - print STDERR "DISCARD: $_\n" if $debugparse; + if ($attrsbody) { + failed($file); + return; } - } -&flush_CURR(); -undef %CURR; + $DB{$rec{"id"}}=\%rec; +} -%PARENT=(); -sub try_reparent +sub process_dir { -my( $reparent,$refkey ); - - $refkey=&name_to_key($ref); - return 0 if ($refkey eq ""); - return 0 if ($parent eq $refkey); - return 0 if !exists $DB{$refkey}; - print STDERR "try_reparent: SUCCESS: key=\"$key\", refkey=\"$refkey\"\n" if $debugmatch; - if ($note_rest) - { $DB{$key}{"Notes"}=$note_rest; } - else - { delete $DB{$key}{"Notes"}; } - - foreach $reparent (@{$OWNS{$parent}}) { - $PARENT{$reparent}=$refkey; +my( $dir )=@_; +my( $old )=getcwd(); + + if (!chdir($dir)) { + warn("Unable to change to dir \"$dir\" from dir \"$old\": $!"); + return; + } + opendir(DIR,".") or die("Cannot open . in \"".getcwd()."\": $!"); + foreach (sort readdir(DIR)) { + process_entry($_) if ($_!~/^\./); } - @{$OWNS{$parent}}=() if (!exists $OWNS{$parent}); - @{$OWNS{$refkey}}=() if (!exists $OWNS{$refkey}); - @{$OWNS{$refkey}}=(@{$OWNS{$refkey}},$parent,@{$OWNS{$parent}}); - delete $OWNS{$parent}; + closedir(DIR); + chdir($old) or warn("Unable to retreat to dir \"$old\": $!"); +} + +sub process_entry +{ +my( $entry )=@_; - return 1; + process_dir ($entry) if -d $entry; + process_file($entry) if -f $entry; } -MATCH: foreach $key (keys %DB) { - $parent=$PARENT{$key}; - $parent=$key if (!defined $parent); - $ref=$DB{$key}{"Notes"}; - next MATCH if (!defined $ref); - $ref=~tr/()//d; - $ref=~s/^=//; - $ref=~s/^O\. *//i; - $ref=~s/^Orchidaceae *//i; - $ref=~s/\. *(.*)$//; - $note_rest=$1; - next MATCH if &try_reparent(); - $ref=$DB{$key}{"Rank"}." $ref" if exists $DB{$key}{"Rank"}; - next MATCH if &try_reparent(); - $ref=$DB{$key}{"Genus"}." $ref" if exists $DB{$key}{"Genus"}; - next MATCH if &try_reparent(); +%OWNS=(); + +foreach (@ARGV) + { process_entry($_); } + +my($id); +for $id (keys %DB) { + my($refid); + my(@refs); + for $refid (@{$DB{$id}{"refs"}}) { + if (!exists $DB{$refid}) { + warn "Undefined ref id \"$refid\" from id \"$id\"" if $D; + next; + } + next if $id eq $refid; # self-ref +# push(@refs,$DB{$refid}); + push(@refs,$refid); + } + $DB{$id}{"refs"}=\@refs; } -undef %PARENT; + +print Data::Dumper->Dump([\%DB],["%DB"]) if $D; + +%OWNS=map { $_=>[] } keys(%DB); + +for $id (keys %OWNS) { + my($refid); + for $refid (@{$DB{$id}{"refs"}}) { + next if !exists $OWNS{$refid}; + push(@{$OWNS{$id}},$refid,@{$OWNS{$refid}}); + delete $OWNS{$refid}; + } + } + +print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D; #foreach $key (keys %DB) { -# $DB{$key}{"Publication"}=&extract_year($DB{$key}{"Publication"})." ::: ".$DB{$key}{"Publication"} +# $DB{$key}{"Publication"}=extract_year($DB{$key}{"Publication"})." ::: ".$DB{$key}{"Publication"} # if (exists($DB{$key}{"Publication"})); # } @@ -169,9 +225,9 @@ foreach $key (keys %OWNS) { my(@keys)=@{$OWNS{$key}}; delete($OWNS{$key}); unshift(@keys,$key); - @keys=sort { &extract_year($DB{$b}{"Publication"}) <=> &extract_year($DB{$a}{"Publication"}); } @keys; + @keys=sort { extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys; my($pkey)=shift(@keys); - @{$OWNS{$pkey}}=@keys; + $OWNS{$pkey}=\@keys; } sub print_header @@ -179,76 +235,89 @@ sub print_header my($header)=@_; print OUT -" +"Kewensis $header + + \n"; } -$fileno=-1; - sub OUT_flush { print OUT "\n"; close(OUT); } -my( $filename ); -foreach $owner (sort keys %OWNS) { -my( $child ); - - if ($fileno<0 || tell(OUT)>=$maxsize) { - &OUT_flush() if ($fileno>=0); - $fileid=sprintf("%04d",++$fileno); - $filename="kew-$fileid.html"; - open(OUT,">$filename") or die "Cannot open \"$filename\": $!"; - &print_header("chunk $fileid"); - } +if (defined $maxsize) { + my($fileno)=-1; + my($filename,$fileid); + foreach $owner (sort keys %OWNS) { + my( $child ); + + if ($fileno<0 || tell(OUT)>=$maxsize) { + OUT_flush() if ($fileno>=0); + $fileid=sprintf("%04d",++$fileno); + $filename="kew-$fileid.html"; + open(OUT,">$filename") or die "Cannot open \"$filename\": $!"; + print_header("chunk $fileid"); + } - if ($debugmatch) { - print STDERR "($owner):"; - foreach $child (@{$OWNS{$owner}}) - { print STDERR " ($child)"; } - print STDERR "\n"; - } - print OUT &format_record("","",%{$DB{$owner}}); - if (@{$OWNS{$owner}}) { - print OUT "
\n"; - foreach $child (@{$OWNS{$owner}}) - { print OUT &format_record("","",%{$DB{$child}}); } - print OUT "
\n"; + if ($debugmatch) { + print STDERR "($owner):"; + foreach $child (@{$OWNS{$owner}}) + { print STDERR " ($child)"; } + print STDERR "\n"; + } + print OUT format_record("

","

\n",%{$DB{$owner}}); + if (@{$OWNS{$owner}}) { + print OUT "
\n"; + foreach $child (@{$OWNS{$owner}}) + { print OUT format_record("

","

\n",%{$DB{$child}}); } + print OUT "
\n"; + } + print OUT "\n"; + $DB{$owner}{"_filename"}=$filename; } - print OUT "\n"; - $DB{$owner}{"_filename"}=$filename; + OUT_flush(); + open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!"; + print_header("Index"); + } +else { + open(OUT,">kewensis.html") or die "Cannot open \"kewensis.html\": $!"; + print_header("Full"); } -&OUT_flush(); -open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!"; -&print_header("Index"); -sub print_href +sub format_href { -my($owner)=@_; -my(%rec)=%{$DB{$owner}}; +my($preinsert,$postinsert,%rec)=@_; - print OUT &htmlquote(&rec_to_name(%rec)); + return htmlquote(rec_to_name(%rec)); } +my($printrecref)=(defined $maxsize ? \&format_href : \&format_record ); + foreach $owner (sort keys %OWNS) { my( $child ); - print OUT "

"; - &print_href($owner); - print OUT "

"; + print OUT "

"; + print OUT "" if defined $maxsize; + print OUT &{$printrecref}("","",%{$DB{$owner}}); + print OUT "" if defined $maxsize; + print OUT "

"; if (@{$OWNS{$owner}}) { print OUT "
    \n"; foreach $child (@{$OWNS{$owner}}) { print OUT "
  • "; - &print_href($child); + print OUT &{$printrecref}("","",%{$DB{$child}}); print OUT "
  • \n"; } print OUT "
"; -- 1.8.3.1