kewensis-collect.pl

   1 #! /usr/bin/perl
   2
   3 use strict;
   4 use warnings;
   5
   6 use Cwd;
   7 use Data::Dumper;
   8
   9 my(%DB,%OWNS,$debugmatch,$D,$key,$owner,$ref,$maxsize,$filename,$doimport,$import_xlate);
  10
  11 # $maxsize=0x40000;
  12 %DB=();
  13 $D=0;
  14 $debugmatch=0;
  15 $doimport=1;
  16 $import_xlate=1;
  17
  18 sub rec_to_name
  19 {
  20 my( %rec )=@_;
  21
  22         return $rec{"name"};
  23 }
  24
  25 sub htmlquote
  26 {
  27 my($class);
  28 ($_,$class)=@_;
  29
  30         s/&/&amp;/g;
  31         s/</&lt;/g;
  32         s/>/&gt;/g;
  33         s/\n/&nl;/g;
  34         s/"/&quot;/g;
  35         return "<span class=\"$class\">$_</span>" if defined $class;
  36         return $_;
  37 }
  38
  39 sub format_record
  40 {
  41 my( $preinsert,$postinsert,%rec )=@_;
  42
  43         my($r)=$preinsert.htmlquote(rec_to_name(%rec),"name").$postinsert;
  44         $r.="\n".htmlquote($rec{"Publ. Author"},"author") if (exists($rec{"Publ. Author"}));
  45         $r.="<br />\n".htmlquote($rec{"Publication"},"publication") if (exists($rec{"Publication"}));
  46         $r.="<br />\n".htmlquote($rec{"Notes"},"notes") if (exists($rec{"Notes"}));
  47         if ($doimport && exists($rec{"html"})) {
  48                 my($import)="import: [".$rec{"id"}."]";
  49                 $r.="<br />\n<blockquote><!-- BEGIN $import -->\n".$rec{"html"}."\n<!-- END $import --></blockquote>\n";
  50                 }
  51         return $r;
  52 }
  53
  54 sub extract_year
  55 {
  56         ($_)=@_;
  57         if (defined($_)) {
  58                 s/\(([0-9]{4}) publ. [0-9]{4}\)/($1)/g;
  59                 while (/\(([0-9]{4})(-[0-9]+)?\)/) {
  60                         return $1 if ($1>=1700 && $1<=2010);
  61                         $_=$';
  62                         }
  63                 }
  64         return -1;
  65 }
  66
  67 sub failed
  68 {
  69 my($file)=@_;
  70
  71         print("-$file----------------$'--------------\n");
  72         warn "Unable to match file \"".getcwd()."/$file\".";
  73 }
  74
  75 sub process_file
  76 {
  77 my($file)=@_;
  78
  79         if (!open(FI,$file)) {
  80                 warn("Unable to open file \"$file\" in dir \"".getcwd()."\": $!");
  81                 return;
  82                 }
  83         undef $/;
  84         my($fi)=<FI>;
  85         close(FI);
  86         my($word)='[^<]*';
  87   my($bigword)='[^<]*(?:<IT>)?[^<]*(?:<RO>)?[^<]*(?:<1000 m)?[^<]*';
  88         my($idone)='[-\d]';
  89         my($id)="$idone+";
  90         my($any)='[\x00-\xFF]*';
  91         my($ipniservletword)="<a href=\"\\./IpniServlet\\?id=($id)&query_type=by_id\">($word)</a>";
  92         my($idquoted)=$id; $idquoted=~s%\W%\\$&%g;
  93         my($ipniservletwordthree)=$ipniservletword; $ipniservletwordthree=~s%\($idquoted\)%((?:$idone){3})$&%os or die;
  94         my($attrpat)="<p>nomenclatural synonym(\\(Main Record\\))?:$ipniservletword</p>|<p>basionym(\\(\\d+\\))?:$ipniservletword</p>|<p>basionym:($word)|<p>replaced synonym:$ipniservletword</p>|<p>replaced synonym:($word)|<p>Is a replaced synonym of:$ipniservletword</p>|<p>Is a basionym of:$ipniservletword</p>|<p>later publication of(\\(\\d+\\))?:$ipniservletword</p>|<p>Is a later publication of of:$ipniservletword</p>";
  95         $|=1;
  96         "<undef>"=~/^/;
  97         if ($fi!~m%^<html>
  98 <head>
  99 <title>IPNI Query Results</title>
 100 </head>
 101 <body bgcolor="#ffffff" text="#000000" link="#006666" vlink="#008080" alink="#008080">
 102 <HR><b><i>Orchidaceae</i> ($word)</b> ($word) <br>
 103 ((?:<a href="\./PublicationServlet\?id=($id)&query_type=by_id"> ($word)</a> ($word)|$bigword(?:<br>\n$word)*)?<p>($bigword)?</p><p>($word)?</p>(?:
 104 remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
 105 <h4>Linked Records</h2>
 106 ((?:$attrpat)*))?(<br>
 107
 108 <h4>Original Data</h2>
 109 (?:basionym: ($word)<br>
 110 )?(?:hybrid parentage: ($word)<br>
 111 )?(?:replaced synonym: ($word)<br>
 112 )?(?:distribution: ($word)<br>
 113 )?(?:Notes: ($word))?)?)(?:<p><a href="\./query_ipni.html">Back to Search Page</a></p>
 114 </body>
 115 </html>)$%os) {
 116                 failed($file);
 117                 return;
 118                 }
 119         my(%rec);
 120         $rec{"name"}=$1;
 121         $rec{"Publ. Author"}=$2;
 122         $rec{"Publication"}="$5 $6" if defined($5) && defined($6);
 123         $rec{"html"}=$3 if defined $3;
 124         my($attrsbody)=$11;
 125         ($rec{"id"}=$file)=~s#^($id)\.html$#$1#os or failed($file);
 126         $rec{"html"}=~s#$ipniservletwordthree#<a href="$1/$1$2.html">$3</a>#osg if $import_xlate && exists $rec{"html"};
 127         while (defined($attrsbody) && $attrsbody=~s%^(?:$attrpat)%%os) {
 128                 # nomenclatural synonym: id=$2
 129                 # basionym: id=$5
 130                 # replaced synonym: id=$8
 131                 # Is a replaced synonym: id=$11
 132                 # Is a basionym: id=$13
 133                 # later publication: id=$16
 134                 # Is a later publication: id=$18
 135                 my(@refs)=($2,$5,$8,$11,$13,$16,$18);
 136                 $rec{"refs"}=[];
 137                 while (@refs) {
 138                         push(@{$rec{"refs"}},$_) if defined ($_=shift @refs);
 139                         }
 140                 }
 141         if ($attrsbody) {
 142                 failed($file);
 143                 return;
 144                 }
 145         $DB{$rec{"id"}}=\%rec;
 146 }
 147
 148
 149 sub process_dir
 150 {
 151 my( $dir )=@_;
 152 my( $old )=getcwd();
 153
 154         if (!chdir($dir)) {
 155                 warn("Unable to change to dir \"$dir\" from dir \"$old\": $!");
 156                 return;
 157                 }
 158         opendir(DIR,".") or die("Cannot open . in \"".getcwd()."\": $!");
 159         foreach (sort readdir(DIR)) {
 160                 process_entry($_) if ($_!~/^\./);
 161                 }
 162         closedir(DIR);
 163         chdir($old) or warn("Unable to retreat to dir \"$old\": $!");
 164 }
 165
 166 sub process_entry
 167 {
 168 my( $entry )=@_;
 169
 170         process_dir ($entry) if -d $entry;
 171         process_file($entry) if -f $entry;
 172 }
 173
 174 %OWNS=();
 175
 176 foreach (@ARGV)
 177         { process_entry($_); }
 178
 179 my($id);
 180 for $id (keys %DB) {
 181         my($refid);
 182         my(@refs);
 183         for $refid (@{$DB{$id}{"refs"}}) {
 184                 if (!exists $DB{$refid}) {
 185                         warn "Undefined ref id \"$refid\" from id \"$id\"" if $D;
 186                         next;
 187                         }
 188                 next if $id eq $refid;  # self-ref
 189 #               push(@refs,$DB{$refid});
 190                 push(@refs,$refid);
 191                 }
 192         $DB{$id}{"refs"}=\@refs;
 193         }
 194
 195 print Data::Dumper->Dump([\%DB],["%DB"]) if $D;
 196
 197 %OWNS=map { $_=>[] } keys(%DB);
 198
 199 for $id (keys %OWNS) {
 200         my($refid);
 201         for $refid (@{$DB{$id}{"refs"}}) {
 202                 next if !exists $OWNS{$refid};
 203                 push(@{$OWNS{$id}},$refid,@{$OWNS{$refid}});
 204                 delete $OWNS{$refid};
 205                 }
 206         }
 207
 208 print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D;
 209
 210 #foreach $key (keys %DB) {
 211 #       $DB{$key}{"Publication"}=extract_year($DB{$key}{"Publication"})." ::: ".$DB{$key}{"Publication"}
 212 #                       if (exists($DB{$key}{"Publication"}));
 213 #       }
 214
 215 foreach $key (keys %OWNS) {
 216         my(@keys)=@{$OWNS{$key}};
 217         delete($OWNS{$key});
 218         unshift(@keys,$key);
 219         @keys=sort { extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys;
 220         my($pkey)=shift(@keys);
 221         $OWNS{$pkey}=\@keys;
 222         }
 223
 224 sub print_header
 225 {
 226 my($header)=@_;
 227
 228         print OUT
 229 "<?xml version=\"1.0\" encoding=\"utf-8\"?>
 230 <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">
 231 <html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
 232 <head><title>Kewensis $header</title>
 233 <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />
 234 <base href=\"".($import_xlate ? "data/" : "http://www.ipni.org/ipni/")."\" />
 235 <style type=\"text/css\"><!--
 236 .name { font-weight: bold; }
 237 .author { font-variant: small-caps; }
 238 .publication { }
 239 .notes { }
 240 --></style>
 241 </head><body>
 242 \n";
 243 }
 244
 245 sub OUT_flush
 246 {
 247         print OUT "</body></html>\n";
 248         close(OUT);
 249 }
 250
 251 if (defined $maxsize) {
 252         my($fileno)=-1;
 253         my($filename,$fileid);
 254         foreach $owner (sort { $DB{$b}{"name"} cmp $DB{$a}{"name"}; } keys %OWNS) {
 255         my( $child );
 256
 257                 if ($fileno<0 || tell(OUT)>=$maxsize) {
 258                         OUT_flush() if ($fileno>=0);
 259                         $fileid=sprintf("%04d",++$fileno);
 260                         $filename="kew-$fileid.html";
 261                         open(OUT,">$filename") or die "Cannot open \"$filename\": $!";
 262                         print_header("chunk $fileid");
 263                         }
 264
 265                 if ($debugmatch) {
 266                         print STDERR "($owner):";
 267                         foreach $child (@{$OWNS{$owner}})
 268                                 { print STDERR " ($child)"; }
 269                         print STDERR "\n";
 270                         }
 271                 print OUT format_record("<p><a id=\"".htmlquote($owner)."\">","</a></p>\n",%{$DB{$owner}});
 272                 if (@{$OWNS{$owner}}) {
 273                         print OUT "<blockquote>\n";
 274                         foreach $child (@{$OWNS{$owner}})
 275                                 { print OUT format_record("<p>","</p>\n",%{$DB{$child}}); }
 276                         print OUT "</blockquote>\n";
 277                         }
 278                 print OUT "\n";
 279                 $DB{$owner}{"_filename"}=$filename;
 280                 }
 281         OUT_flush();
 282         open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!";
 283         print_header("Index");
 284         }
 285 else {
 286         open(OUT,">kewensis.html") or die "Cannot open \"kewensis.html\": $!";
 287         print_header("Full");
 288         }
 289
 290 sub format_href
 291 {
 292 my($preinsert,$postinsert,%rec)=@_;
 293
 294         return htmlquote(rec_to_name(%rec));
 295 }
 296
 297 my($printrecref)=(defined $maxsize ? \&format_href : \&format_record );
 298
 299 foreach $owner (sort { $DB{$b}{"name"} cmp $DB{$a}{"name"}; } keys %OWNS) {
 300 my( $child );
 301
 302         print OUT "<p>";
 303         print OUT "<a href=\"".$DB{$owner}{"_filename"}."#$owner\">" if defined $maxsize;
 304         print OUT &{$printrecref}("","",%{$DB{$owner}});
 305         print OUT "</a>" if defined $maxsize;
 306         print OUT "</p>";
 307         if (@{$OWNS{$owner}}) {
 308                 print OUT "<ul>\n";
 309                 foreach $child (@{$OWNS{$owner}}) {
 310                         print OUT "<li>";
 311                         print OUT &{$printrecref}("","",%{$DB{$child}});
 312                         print OUT "</li>\n";
 313                         }
 314                 print OUT "</ul>";
 315                 }
 316         print OUT "\n";
 317         }
 318 OUT_flush();