Parts rewritten to parse ipni.org data
[kewensis.git] / kewensis-collect.pl
1 #! /usr/bin/perl
2
3 use strict;
4 use warnings;
5
6 use Cwd;
7 use Data::Dumper;
8
9 my(%DB,%OWNS,$debugmatch,$D,$key,$owner,$ref,$maxsize,$filename,$doimport,$import_xlate);
10
11 # $maxsize=0x40000;
12 %DB=();
13 $D=0;
14 $debugmatch=0;
15 $doimport=1;
16 $import_xlate=1;
17
18 sub name_to_key
19 {
20 my( $r )=@_;
21
22         $r=~tr/A-Z/a-z/;
23         $r=~tr/a-z0-9//cd;
24         return $r;
25 }
26
27 sub rec_to_name
28 {
29 my( %rec )=@_;
30
31         return $rec{"name"};
32 }
33
34 sub htmlquote
35 {
36 my($class);
37 ($_,$class)=@_;
38
39         s/&/&/g;
40         s/</&lt;/g;
41         s/>/&gt;/g;
42         s/\n/&nl;/g;
43         s/"/&quot;/g;
44         return "<span class=\"$class\">$_</span>" if defined $class;
45         return $_;
46 }
47
48 sub format_record
49 {
50 my( $preinsert,$postinsert,%rec )=@_;
51
52         my($r)=$preinsert.htmlquote(rec_to_name(%rec),"name").$postinsert;
53         $r.="\n".htmlquote($rec{"Publ. Author"},"author") if (exists($rec{"Publ. Author"}));
54         $r.="<br />\n".htmlquote($rec{"Publication"},"publication") if (exists($rec{"Publication"}));
55         $r.="<br />\n".htmlquote($rec{"Notes"},"notes") if (exists($rec{"Notes"}));
56         if ($doimport && exists($rec{"html"})) {
57                 my($import)="import: [".$rec{"id"}."]";
58                 $r.="<br />\n<blockquote><!-- BEGIN $import -->\n".$rec{"html"}."\n<!-- END $import --></blockquote>\n";
59                 }
60         return $r;
61 }
62
63 sub extract_year
64 {
65         ($_)=@_;
66         if (defined($_)) {
67                 s/\(([0-9]{4}) publ. [0-9]{4}\)/($1)/g;
68                 while (/\(([0-9]{4})(-[0-9]+)?\)/) {
69                         return $1 if ($1>=1700 && $1<=2010);
70                         $_=$';
71                         }
72                 }
73         return -1;
74 }
75
76 sub failed
77 {
78 my($file)=@_;
79
80         print("-$file----------------$'--------------\n");
81         warn "Unable to match file \"".getcwd()."/$file\".";
82 }
83
84 sub process_file
85 {
86 my($file)=@_;
87
88         if (!open(FI,$file)) {
89                 warn("Unable to open file \"$file\" in dir \"".getcwd()."\": $!");
90                 return;
91                 }
92         undef $/;
93         my($fi)=<FI>;
94         close(FI);
95         my($word)='[^<]*';
96   my($bigword)='[^<]*(?:<IT>)?[^<]*(?:<RO>)?[^<]*(?:<1000 m)?[^<]*';
97         my($idone)='[-\d]';
98         my($id)="$idone+";
99         my($any)='[\x00-\xFF]*';
100         my($ipniservletword)="<a href=\"\\./IpniServlet\\?id=($id)&query_type=by_id\">($word)</a>";
101         my($idquoted)=$id; $idquoted=~s%\W%\\$&%g;
102         my($ipniservletwordthree)=$ipniservletword; $ipniservletwordthree=~s%\($idquoted\)%((?:$idone){3})$&%os or die;
103         my($attrpat)="<p>nomenclatural synonym(\\(Main Record\\))?:$ipniservletword</p>|<p>basionym(\\(\\d+\\))?:$ipniservletword</p>|<p>basionym:($word)|<p>replaced synonym:$ipniservletword</p>|<p>replaced synonym:($word)|<p>Is a replaced synonym of:$ipniservletword</p>|<p>Is a basionym of:$ipniservletword</p>|<p>later publication of(\\(\\d+\\))?:$ipniservletword</p>|<p>Is a later publication of of:$ipniservletword</p>";
104         $|=1;
105         "<undef>"=~/^/;
106         if ($fi!~m%^<html>
107 <head>
108 <title>IPNI Query Results</title>
109 </head>
110 <body bgcolor="#ffffff" text="#000000" link="#006666" vlink="#008080" alink="#008080">
111 <HR><b><i>Orchidaceae</i> ($word)</b> ($word) <br>
112 ((?:<a href="\./PublicationServlet\?id=($id)&query_type=by_id"> ($word)</a> ($word)|$bigword(?:<br>\n$word)*)?<p>($bigword)?</p><p>($word)?</p>(?:
113 remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
114 <h4>Linked Records</h2>
115 ((?:$attrpat)*))?(<br>
116
117 <h4>Original Data</h2>
118 (?:basionym: ($word)<br>
119 )?(?:hybrid parentage: ($word)<br>
120 )?(?:replaced synonym: ($word)<br>
121 )?(?:distribution: ($word)<br>
122 )?(?:Notes: ($word))?)?)(?:<p><a href="\./query_ipni.html">Back to Search Page</a></p>
123 </body>
124 </html>)$%os) {
125                 failed($file);
126                 return;
127                 }
128         my(%rec);
129         $rec{"name"}=$1;
130         $rec{"Publ. Author"}=$2;
131         $rec{"Publication"}="$5 $6" if defined($5) && defined($6);
132         $rec{"html"}=$3 if defined $3;
133         my($attrsbody)=$11;
134         ($rec{"id"}=$file)=~s#^($id)\.html$#$1#os or failed($file);
135         $rec{"html"}=~s#$ipniservletwordthree#<a href="$1/$1$2.html">$3</a>#osg if $import_xlate && exists $rec{"html"};
136         while (defined($attrsbody) && $attrsbody=~s%^(?:$attrpat)%%os) {
137                 # nomenclatural synonym: id=$2
138                 # basionym: id=$5
139                 # replaced synonym: id=$8
140                 # Is a replaced synonym: id=$11
141                 # Is a basionym: id=$13
142                 # later publication: id=$16
143                 # Is a later publication: id=$18
144                 my(@refs)=($2,$5,$8,$11,$13,$16,$18);
145                 $rec{"refs"}=[];
146                 while (@refs) {
147                         push(@{$rec{"refs"}},$_) if defined ($_=shift @refs);
148                         }
149                 }
150         if ($attrsbody) {
151                 failed($file);
152                 return;
153                 }
154         $DB{$rec{"id"}}=\%rec;
155 }
156
157
158 sub process_dir
159 {
160 my( $dir )=@_;
161 my( $old )=getcwd();
162
163         if (!chdir($dir)) {
164                 warn("Unable to change to dir \"$dir\" from dir \"$old\": $!");
165                 return;
166                 }
167         opendir(DIR,".") or die("Cannot open . in \"".getcwd()."\": $!");
168         foreach (sort readdir(DIR)) {
169                 process_entry($_) if ($_!~/^\./);
170                 }
171         closedir(DIR);
172         chdir($old) or warn("Unable to retreat to dir \"$old\": $!");
173 }
174
175 sub process_entry
176 {
177 my( $entry )=@_;
178
179         process_dir ($entry) if -d $entry;
180         process_file($entry) if -f $entry;
181 }
182
183 %OWNS=();
184
185 foreach (@ARGV)
186         { process_entry($_); }
187
188 my($id);
189 for $id (keys %DB) {
190         my($refid);
191         my(@refs);
192         for $refid (@{$DB{$id}{"refs"}}) {
193                 if (!exists $DB{$refid}) {
194                         warn "Undefined ref id \"$refid\" from id \"$id\"" if $D;
195                         next;
196                         }
197                 next if $id eq $refid;  # self-ref
198 #               push(@refs,$DB{$refid});
199                 push(@refs,$refid);
200                 }
201         $DB{$id}{"refs"}=\@refs;
202         }
203
204 print Data::Dumper->Dump([\%DB],["%DB"]) if $D;
205
206 %OWNS=map { $_=>[] } keys(%DB);
207
208 for $id (keys %OWNS) {
209         my($refid);
210         for $refid (@{$DB{$id}{"refs"}}) {
211                 next if !exists $OWNS{$refid};
212                 push(@{$OWNS{$id}},$refid,@{$OWNS{$refid}});
213                 delete $OWNS{$refid};
214                 }
215         }
216
217 print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D;
218
219 #foreach $key (keys %DB) {
220 #       $DB{$key}{"Publication"}=extract_year($DB{$key}{"Publication"})." ::: ".$DB{$key}{"Publication"}
221 #                       if (exists($DB{$key}{"Publication"}));
222 #       }
223
224 foreach $key (keys %OWNS) {
225         my(@keys)=@{$OWNS{$key}};
226         delete($OWNS{$key});
227         unshift(@keys,$key);
228         @keys=sort { extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys;
229         my($pkey)=shift(@keys);
230         $OWNS{$pkey}=\@keys;
231         }
232
233 sub print_header
234 {
235 my($header)=@_;
236
237         print OUT
238 "<?xml version=\"1.0\" encoding=\"utf-8\"?>
239 <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">
240 <html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
241 <head><title>Kewensis $header</title>
242 <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />
243 <base href=\"".($import_xlate ? "data/" : "http://www.ipni.org/ipni/")."\" />
244 <style type=\"text/css\"><!--
245 .name { font-weight: bold; }
246 .author { font-variant: small-caps; }
247 .publication { }
248 .notes { }
249 --></style>
250 </head><body>
251 \n";
252 }
253
254 sub OUT_flush
255 {
256         print OUT "</body></html>\n";
257         close(OUT);
258 }
259
260 if (defined $maxsize) {
261         my($fileno)=-1;
262         my($filename,$fileid);
263         foreach $owner (sort keys %OWNS) {
264         my( $child );
265
266                 if ($fileno<0 || tell(OUT)>=$maxsize) {
267                         OUT_flush() if ($fileno>=0);
268                         $fileid=sprintf("%04d",++$fileno);
269                         $filename="kew-$fileid.html";
270                         open(OUT,">$filename") or die "Cannot open \"$filename\": $!";
271                         print_header("chunk $fileid");
272                         }
273
274                 if ($debugmatch) {
275                         print STDERR "($owner):";
276                         foreach $child (@{$OWNS{$owner}})
277                                 { print STDERR " ($child)"; }
278                         print STDERR "\n";
279                         }
280                 print OUT format_record("<p><a id=\"".htmlquote($owner)."\">","</a></p>\n",%{$DB{$owner}});
281                 if (@{$OWNS{$owner}}) {
282                         print OUT "<blockquote>\n";
283                         foreach $child (@{$OWNS{$owner}})
284                                 { print OUT format_record("<p>","</p>\n",%{$DB{$child}}); }
285                         print OUT "</blockquote>\n";
286                         }
287                 print OUT "\n";
288                 $DB{$owner}{"_filename"}=$filename;
289                 }
290         OUT_flush();
291         open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!";
292         print_header("Index");
293         }
294 else {
295         open(OUT,">kewensis.html") or die "Cannot open \"kewensis.html\": $!";
296         print_header("Full");
297         }
298
299 sub format_href
300 {
301 my($preinsert,$postinsert,%rec)=@_;
302
303         return htmlquote(rec_to_name(%rec));
304 }
305
306 my($printrecref)=(defined $maxsize ? \&format_href : \&format_record );
307
308 foreach $owner (sort keys %OWNS) {
309 my( $child );
310
311         print OUT "<p>";
312         print OUT "<a href=\"".$DB{$owner}{"_filename"}."#$owner\">" if defined $maxsize;
313         print OUT &{$printrecref}("","",%{$DB{$owner}});
314         print OUT "</a>" if defined $maxsize;
315         print OUT "</p>";
316         if (@{$OWNS{$owner}}) {
317                 print OUT "<ul>\n";
318                 foreach $child (@{$OWNS{$owner}}) {
319                         print OUT "<li>";
320                         print OUT &{$printrecref}("","",%{$DB{$child}});
321                         print OUT "</li>\n";
322                         }
323                 print OUT "</ul>";
324                 }
325         print OUT "\n";
326         }
327 OUT_flush();