X-Git-Url: http://git.jankratochvil.net/?p=kewensis.git;a=blobdiff_plain;f=kewensis-collect.pl;h=aba360db8e3b92703f937f85c9d6078896f1ec50;hp=f71d434fd081e9eb486011795d39bdc8f2674913;hb=c34552151797af2bf9fdd40b52026bfbc027ccab;hpb=525245c5f9c7f716b224a04a62fd9b4a3b501bb8
diff --git a/kewensis-collect.pl b/kewensis-collect.pl
index f71d434..aba360d 100755
--- a/kewensis-collect.pl
+++ b/kewensis-collect.pl
@@ -1,12 +1,19 @@
-#! /usr/bin/perl -w
+#! /usr/bin/perl
-use vars qw/%DB %OWNS %PARENT %CURR $last_field $debugmatch $debugparse $key $owner $parent $ref/;
+use strict;
+use warnings;
+use Cwd;
+use Data::Dumper;
+
+my(%DB,%OWNS,$debugmatch,$D,$key,$owner,$ref,$maxsize,$filename,$doimport,$import_xlate);
+
+# $maxsize=0x40000;
%DB=();
-%CURR=();
-$debugparse=0;
-$printdupl=0;
+$D=0;
$debugmatch=0;
+$doimport=1;
+$import_xlate=1;
sub name_to_key
{
@@ -20,106 +27,301 @@ my( $r )=@_;
sub rec_to_name
{
my( %rec )=@_;
-my( @list )=("Rank","Infrafam.","Genus","Species");
-my( $item,$r );
- $r="";
- while ($item=shift @list)
- { $r.=" ".$rec{$item} if exists $rec{$item}; }
- $r=~s/^ //;
+ return $rec{"name"};
+}
+
+sub htmlquote
+{
+my($class);
+($_,$class)=@_;
+
+ s/&/&/g;
+ s/</g;
+ s/>/>/g;
+ s/\n/&nl;/g;
+ s/"/"/g;
+ return "$_" if defined $class;
+ return $_;
+}
+
+sub format_record
+{
+my( $preinsert,$postinsert,%rec )=@_;
+
+ my($r)=$preinsert.htmlquote(rec_to_name(%rec),"name").$postinsert;
+ $r.="\n".htmlquote($rec{"Publ. Author"},"author") if (exists($rec{"Publ. Author"}));
+ $r.="
\n".htmlquote($rec{"Publication"},"publication") if (exists($rec{"Publication"}));
+ $r.="
\n".htmlquote($rec{"Notes"},"notes") if (exists($rec{"Notes"}));
+ if ($doimport && exists($rec{"html"})) {
+ my($import)="import: [".$rec{"id"}."]";
+ $r.="
\n
\n".$rec{"html"}."\n
\n";
+ }
return $r;
}
-sub flush_CURR
+sub extract_year
{
-my( $key );
-
- $key=&name_to_key(&rec_to_name(%CURR));
- return if (!defined $key);
- if (exists $DB{$key})
- { print STDERR "Key \"$key\" is duplicated!\n" if $printdupl; }
- else {
- print STDERR "ADDKEY: $key\n" if $debugparse;
- %{$DB{$key}}=%CURR;
+ ($_)=@_;
+ if (defined($_)) {
+ s/\(([0-9]{4}) publ. [0-9]{4}\)/($1)/g;
+ while (/\(([0-9]{4})(-[0-9]+)?\)/) {
+ return $1 if ($1>=1700 && $1<=2010);
+ $_=$';
+ }
}
- %CURR=();
+ return -1;
+}
+
+sub failed
+{
+my($file)=@_;
+
+ print("-$file----------------$'--------------\n");
+ warn "Unable to match file \"".getcwd()."/$file\".";
}
-while (<>) {
- tr/\r\n//d;
- if (/^ *\304\301\304/) {
- &flush_CURR();
- print STDERR "---\n" if $debugparse;
+sub process_file
+{
+my($file)=@_;
+
+ if (!open(FI,$file)) {
+ warn("Unable to open file \"$file\" in dir \"".getcwd()."\": $!");
+ return;
}
- elsif (/^([^³]*[^ ]) *³ *(.*)$/) {
- print STDERR "FIELD: $1, VALUE: $2\n" if $debugparse;
- $last_field=$1;
- $CURR{$1}=$2;
+ undef $/;
+ my($fi)=;
+ close(FI);
+ my($word)='[^<]*';
+ my($bigword)='[^<]*(?:)?[^<]*(?:)?[^<]*(?:<1000 m)?[^<]*';
+ my($idone)='[-\d]';
+ my($id)="$idone+";
+ my($any)='[\x00-\xFF]*';
+ my($ipniservletword)="($word)";
+ my($idquoted)=$id; $idquoted=~s%\W%\\$&%g;
+ my($ipniservletwordthree)=$ipniservletword; $ipniservletwordthree=~s%\($idquoted\)%((?:$idone){3})$&%os or die;
+ my($attrpat)="nomenclatural synonym(\\(Main Record\\))?:$ipniservletword
|basionym(\\(\\d+\\))?:$ipniservletword
|basionym:($word)|
replaced synonym:$ipniservletword
|replaced synonym:($word)|
Is a replaced synonym of:$ipniservletword
|Is a basionym of:$ipniservletword
|later publication of(\\(\\d+\\))?:$ipniservletword
|Is a later publication of of:$ipniservletword
";
+ $|=1;
+ ""=~/^/;
+ if ($fi!~m%^
+
+IPNI Query Results
+
+
+
Orchidaceae ($word) ($word)
+((?: ($word) ($word)|$bigword(?:
\n$word)*)?($bigword)?
($word)?
(?:
+remarks: .*)?(
Type
)?()?(?:
+Linked Records
+((?:$attrpat)*))?(
+
+Original Data
+(?:basionym: ($word)
+)?(?:hybrid parentage: ($word)
+)?(?:replaced synonym: ($word)
+)?(?:distribution: ($word)
+)?(?:Notes: ($word))?)?)(?:
Back to Search Page
+
+)$%os) {
+ failed($file);
+ return;
}
- elsif (/^ *³ *(.*)$/) {
- print STDERR "APPEND: $1\n" if $debugparse;
- $CURR{$last_field}.=" ".$1;
+ my(%rec);
+ $rec{"name"}=$1;
+ $rec{"Publ. Author"}=$2;
+ $rec{"Publication"}="$5 $6" if defined($5) && defined($6);
+ $rec{"html"}=$3 if defined $3;
+ my($attrsbody)=$11;
+ ($rec{"id"}=$file)=~s#^($id)\.html$#$1#os or failed($file);
+ $rec{"html"}=~s#$ipniservletwordthree#$3#osg if $import_xlate && exists $rec{"html"};
+ while (defined($attrsbody) && $attrsbody=~s%^(?:$attrpat)%%os) {
+ # nomenclatural synonym: id=$2
+ # basionym: id=$5
+ # replaced synonym: id=$8
+ # Is a replaced synonym: id=$11
+ # Is a basionym: id=$13
+ # later publication: id=$16
+ # Is a later publication: id=$18
+ my(@refs)=($2,$5,$8,$11,$13,$16,$18);
+ $rec{"refs"}=[];
+ while (@refs) {
+ push(@{$rec{"refs"}},$_) if defined ($_=shift @refs);
+ }
}
- else {
- print STDERR "DISCARD: $_\n" if $debugparse;
+ if ($attrsbody) {
+ failed($file);
+ return;
}
- }
-&flush_CURR();
-undef %CURR;
+ $DB{$rec{"id"}}=\%rec;
+}
-%OWNS=();
-%PARENT=();
-sub try_reparent
+sub process_dir
{
-my( $reparent,$refkey );
+my( $dir )=@_;
+my( $old )=getcwd();
- $refkey=&name_to_key($ref);
- return 0 if ($refkey eq "");
- return 0 if ($parent eq $refkey);
- return 0 if !exists $DB{$refkey};
- print STDERR "try_reparent: SUCCESS: key=\"$key\", refkey=\"$refkey\"\n" if $debugmatch;
+ if (!chdir($dir)) {
+ warn("Unable to change to dir \"$dir\" from dir \"$old\": $!");
+ return;
+ }
+ opendir(DIR,".") or die("Cannot open . in \"".getcwd()."\": $!");
+ foreach (sort readdir(DIR)) {
+ process_entry($_) if ($_!~/^\./);
+ }
+ closedir(DIR);
+ chdir($old) or warn("Unable to retreat to dir \"$old\": $!");
+}
+
+sub process_entry
+{
+my( $entry )=@_;
+
+ process_dir ($entry) if -d $entry;
+ process_file($entry) if -f $entry;
+}
- foreach $reparent (@{$OWNS{$parent}}) {
- $PARENT{$reparent}=$refkey;
+%OWNS=();
+
+foreach (@ARGV)
+ { process_entry($_); }
+
+my($id);
+for $id (keys %DB) {
+ my($refid);
+ my(@refs);
+ for $refid (@{$DB{$id}{"refs"}}) {
+ if (!exists $DB{$refid}) {
+ warn "Undefined ref id \"$refid\" from id \"$id\"" if $D;
+ next;
+ }
+ next if $id eq $refid; # self-ref
+# push(@refs,$DB{$refid});
+ push(@refs,$refid);
+ }
+ $DB{$id}{"refs"}=\@refs;
+ }
+
+print Data::Dumper->Dump([\%DB],["%DB"]) if $D;
+
+%OWNS=map { $_=>[] } keys(%DB);
+
+for $id (keys %OWNS) {
+ my($refid);
+ for $refid (@{$DB{$id}{"refs"}}) {
+ next if !exists $OWNS{$refid};
+ push(@{$OWNS{$id}},$refid,@{$OWNS{$refid}});
+ delete $OWNS{$refid};
}
- @{$OWNS{$parent}}=() if (!defined $OWNS{$parent});
- @{$OWNS{$refkey}}=() if (!defined $OWNS{$refkey});
- @{$OWNS{$refkey}}=(@{$OWNS{$refkey}},$parent,@{$OWNS{$parent}});
- delete $OWNS{$parent};
+ }
+
+print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D;
+
+#foreach $key (keys %DB) {
+# $DB{$key}{"Publication"}=extract_year($DB{$key}{"Publication"})." ::: ".$DB{$key}{"Publication"}
+# if (exists($DB{$key}{"Publication"}));
+# }
- return 1;
+foreach $key (keys %OWNS) {
+ my(@keys)=@{$OWNS{$key}};
+ delete($OWNS{$key});
+ unshift(@keys,$key);
+ @keys=sort { extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys;
+ my($pkey)=shift(@keys);
+ $OWNS{$pkey}=\@keys;
+ }
+
+sub print_header
+{
+my($header)=@_;
+
+ print OUT
+"
+
+
+Kewensis $header
+
+
+
+
+\n";
+}
+
+sub OUT_flush
+{
+ print OUT "\n";
+ close(OUT);
}
-MATCH: foreach $key (keys %DB) {
- $parent=$PARENT{$key};
- $parent=$key if (!defined $parent);
- $ref=$DB{$key}{"Notes"};
- next MATCH if (!defined $ref);
- $ref=~tr/()//d;
- $ref=~s/^=//;
- $ref=~s/^O\. *//i;
- $ref=~s/^Orchidaceae *//i;
- $ref=~s/\..*$//;
- next MATCH if &try_reparent();
- $ref=$DB{$key}{"Rank"}." $ref" if exists $DB{$key}{"Rank"};
- next MATCH if &try_reparent();
- $ref=$DB{$key}{"Genus"}." $ref" if exists $DB{$key}{"Genus"};
- next MATCH if &try_reparent();
+if (defined $maxsize) {
+ my($fileno)=-1;
+ my($filename,$fileid);
+ foreach $owner (sort keys %OWNS) {
+ my( $child );
+
+ if ($fileno<0 || tell(OUT)>=$maxsize) {
+ OUT_flush() if ($fileno>=0);
+ $fileid=sprintf("%04d",++$fileno);
+ $filename="kew-$fileid.html";
+ open(OUT,">$filename") or die "Cannot open \"$filename\": $!";
+ print_header("chunk $fileid");
+ }
+
+ if ($debugmatch) {
+ print STDERR "($owner):";
+ foreach $child (@{$OWNS{$owner}})
+ { print STDERR " ($child)"; }
+ print STDERR "\n";
+ }
+ print OUT format_record("","
\n",%{$DB{$owner}});
+ if (@{$OWNS{$owner}}) {
+ print OUT "\n";
+ foreach $child (@{$OWNS{$owner}})
+ { print OUT format_record("","
\n",%{$DB{$child}}); }
+ print OUT "
\n";
+ }
+ print OUT "\n";
+ $DB{$owner}{"_filename"}=$filename;
+ }
+ OUT_flush();
+ open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!";
+ print_header("Index");
}
-undef %PARENT;
+else {
+ open(OUT,">kewensis.html") or die "Cannot open \"kewensis.html\": $!";
+ print_header("Full");
+ }
+
+sub format_href
+{
+my($preinsert,$postinsert,%rec)=@_;
+
+ return htmlquote(rec_to_name(%rec));
+}
+
+my($printrecref)=(defined $maxsize ? \&format_href : \&format_record );
-foreach $owner (keys %OWNS) {
+foreach $owner (sort keys %OWNS) {
my( $child );
- if ($debugmatch) {
- print STDERR "($owner):";
- foreach $child (@{$OWNS{$owner}})
- { print STDERR " ($child)"; }
- print STDERR "\n";
+ print OUT "";
+ print OUT "" if defined $maxsize;
+ print OUT &{$printrecref}("","",%{$DB{$owner}});
+ print OUT "" if defined $maxsize;
+ print OUT "
";
+ if (@{$OWNS{$owner}}) {
+ print OUT "\n";
+ foreach $child (@{$OWNS{$owner}}) {
+ print OUT "- ";
+ print OUT &{$printrecref}("","",%{$DB{$child}});
+ print OUT "
\n";
+ }
+ print OUT "
";
}
- print "(".&rec_to_name(%{$DB{$owner}}).")";
- foreach $child (@{$OWNS{$owner}})
- { print " (".&rec_to_name(%{$DB{$child}}).")"; }
- print "\n";
+ print OUT "\n";
}
+OUT_flush();