Records are now again properly sorted by "name" (not "id")
[kewensis.git] / kewensis-collect.pl
1 #! /usr/bin/perl
2
3 use strict;
4 use warnings;
5
6 use Cwd;
7 use Data::Dumper;
8
9 my(%DB,%OWNS,$debugmatch,$D,$key,$owner,$ref,$maxsize,$filename,$doimport,$import_xlate);
10
11 # $maxsize=0x40000;
12 %DB=();
13 $D=0;
14 $debugmatch=0;
15 $doimport=1;
16 $import_xlate=1;
17
18 sub rec_to_name
19 {
20 my( %rec )=@_;
21
22         return $rec{"name"};
23 }
24
25 sub htmlquote
26 {
27 my($class);
28 ($_,$class)=@_;
29
30         s/&/&/g;
31         s/</&lt;/g;
32         s/>/&gt;/g;
33         s/\n/&nl;/g;
34         s/"/&quot;/g;
35         return "<span class=\"$class\">$_</span>" if defined $class;
36         return $_;
37 }
38
39 sub format_record
40 {
41 my( $preinsert,$postinsert,%rec )=@_;
42
43         my($r)=$preinsert.htmlquote(rec_to_name(%rec),"name").$postinsert;
44         $r.="\n".htmlquote($rec{"Publ. Author"},"author") if (exists($rec{"Publ. Author"}));
45         $r.="<br />\n".htmlquote($rec{"Publication"},"publication") if (exists($rec{"Publication"}));
46         $r.="<br />\n".htmlquote($rec{"Notes"},"notes") if (exists($rec{"Notes"}));
47         if ($doimport && exists($rec{"html"})) {
48                 my($import)="import: [".$rec{"id"}."]";
49                 $r.="<br />\n<blockquote><!-- BEGIN $import -->\n".$rec{"html"}."\n<!-- END $import --></blockquote>\n";
50                 }
51         return $r;
52 }
53
54 sub extract_year
55 {
56         ($_)=@_;
57         if (defined($_)) {
58                 s/\(([0-9]{4}) publ. [0-9]{4}\)/($1)/g;
59                 while (/\(([0-9]{4})(-[0-9]+)?\)/) {
60                         return $1 if ($1>=1700 && $1<=2010);
61                         $_=$';
62                         }
63                 }
64         return -1;
65 }
66
67 sub failed
68 {
69 my($file)=@_;
70
71         print("-$file----------------$'--------------\n");
72         warn "Unable to match file \"".getcwd()."/$file\".";
73 }
74
75 sub process_file
76 {
77 my($file)=@_;
78
79         if (!open(FI,$file)) {
80                 warn("Unable to open file \"$file\" in dir \"".getcwd()."\": $!");
81                 return;
82                 }
83         undef $/;
84         my($fi)=<FI>;
85         close(FI);
86         my($word)='[^<]*';
87   my($bigword)='[^<]*(?:<IT>)?[^<]*(?:<RO>)?[^<]*(?:<1000 m)?[^<]*';
88         my($idone)='[-\d]';
89         my($id)="$idone+";
90         my($any)='[\x00-\xFF]*';
91         my($ipniservletword)="<a href=\"\\./IpniServlet\\?id=($id)&query_type=by_id\">($word)</a>";
92         my($idquoted)=$id; $idquoted=~s%\W%\\$&%g;
93         my($ipniservletwordthree)=$ipniservletword; $ipniservletwordthree=~s%\($idquoted\)%((?:$idone){3})$&%os or die;
94         my($attrpat)="<p>nomenclatural synonym(\\(Main Record\\))?:$ipniservletword</p>|<p>basionym(\\(\\d+\\))?:$ipniservletword</p>|<p>basionym:($word)|<p>replaced synonym:$ipniservletword</p>|<p>replaced synonym:($word)|<p>Is a replaced synonym of:$ipniservletword</p>|<p>Is a basionym of:$ipniservletword</p>|<p>later publication of(\\(\\d+\\))?:$ipniservletword</p>|<p>Is a later publication of of:$ipniservletword</p>";
95         $|=1;
96         "<undef>"=~/^/;
97         if ($fi!~m%^<html>
98 <head>
99 <title>IPNI Query Results</title>
100 </head>
101 <body bgcolor="#ffffff" text="#000000" link="#006666" vlink="#008080" alink="#008080">
102 <HR><b><i>Orchidaceae</i> ($word)</b> ($word) <br>
103 ((?:<a href="\./PublicationServlet\?id=($id)&query_type=by_id"> ($word)</a> ($word)|$bigword(?:<br>\n$word)*)?<p>($bigword)?</p><p>($word)?</p>(?:
104 remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
105 <h4>Linked Records</h2>
106 ((?:$attrpat)*))?(<br>
107
108 <h4>Original Data</h2>
109 (?:basionym: ($word)<br>
110 )?(?:hybrid parentage: ($word)<br>
111 )?(?:replaced synonym: ($word)<br>
112 )?(?:distribution: ($word)<br>
113 )?(?:Notes: ($word))?)?)(?:<p><a href="\./query_ipni.html">Back to Search Page</a></p>
114 </body>
115 </html>)$%os) {
116                 failed($file);
117                 return;
118                 }
119         my(%rec);
120         $rec{"name"}=$1;
121         $rec{"Publ. Author"}=$2;
122         $rec{"Publication"}="$5 $6" if defined($5) && defined($6);
123         $rec{"html"}=$3 if defined $3;
124         my($attrsbody)=$11;
125         ($rec{"id"}=$file)=~s#^($id)\.html$#$1#os or failed($file);
126         $rec{"html"}=~s#$ipniservletwordthree#<a href="$1/$1$2.html">$3</a>#osg if $import_xlate && exists $rec{"html"};
127         while (defined($attrsbody) && $attrsbody=~s%^(?:$attrpat)%%os) {
128                 # nomenclatural synonym: id=$2
129                 # basionym: id=$5
130                 # replaced synonym: id=$8
131                 # Is a replaced synonym: id=$11
132                 # Is a basionym: id=$13
133                 # later publication: id=$16
134                 # Is a later publication: id=$18
135                 my(@refs)=($2,$5,$8,$11,$13,$16,$18);
136                 $rec{"refs"}=[];
137                 while (@refs) {
138                         push(@{$rec{"refs"}},$_) if defined ($_=shift @refs);
139                         }
140                 }
141         if ($attrsbody) {
142                 failed($file);
143                 return;
144                 }
145         $DB{$rec{"id"}}=\%rec;
146 }
147
148
149 sub process_dir
150 {
151 my( $dir )=@_;
152 my( $old )=getcwd();
153
154         if (!chdir($dir)) {
155                 warn("Unable to change to dir \"$dir\" from dir \"$old\": $!");
156                 return;
157                 }
158         opendir(DIR,".") or die("Cannot open . in \"".getcwd()."\": $!");
159         foreach (sort readdir(DIR)) {
160                 process_entry($_) if ($_!~/^\./);
161                 }
162         closedir(DIR);
163         chdir($old) or warn("Unable to retreat to dir \"$old\": $!");
164 }
165
166 sub process_entry
167 {
168 my( $entry )=@_;
169
170         process_dir ($entry) if -d $entry;
171         process_file($entry) if -f $entry;
172 }
173
174 %OWNS=();
175
176 foreach (@ARGV)
177         { process_entry($_); }
178
179 my($id);
180 for $id (keys %DB) {
181         my($refid);
182         my(@refs);
183         for $refid (@{$DB{$id}{"refs"}}) {
184                 if (!exists $DB{$refid}) {
185                         warn "Undefined ref id \"$refid\" from id \"$id\"" if $D;
186                         next;
187                         }
188                 next if $id eq $refid;  # self-ref
189 #               push(@refs,$DB{$refid});
190                 push(@refs,$refid);
191                 }
192         $DB{$id}{"refs"}=\@refs;
193         }
194
195 print Data::Dumper->Dump([\%DB],["%DB"]) if $D;
196
197 %OWNS=map { $_=>[] } keys(%DB);
198
199 for $id (keys %OWNS) {
200         my($refid);
201         for $refid (@{$DB{$id}{"refs"}}) {
202                 next if !exists $OWNS{$refid};
203                 push(@{$OWNS{$id}},$refid,@{$OWNS{$refid}});
204                 delete $OWNS{$refid};
205                 }
206         }
207
208 print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D;
209
210 #foreach $key (keys %DB) {
211 #       $DB{$key}{"Publication"}=extract_year($DB{$key}{"Publication"})." ::: ".$DB{$key}{"Publication"}
212 #                       if (exists($DB{$key}{"Publication"}));
213 #       }
214
215 foreach $key (keys %OWNS) {
216         my(@keys)=@{$OWNS{$key}};
217         delete($OWNS{$key});
218         unshift(@keys,$key);
219         @keys=sort { extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys;
220         my($pkey)=shift(@keys);
221         $OWNS{$pkey}=\@keys;
222         }
223
224 sub print_header
225 {
226 my($header)=@_;
227
228         print OUT
229 "<?xml version=\"1.0\" encoding=\"utf-8\"?>
230 <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">
231 <html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
232 <head><title>Kewensis $header</title>
233 <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />
234 <base href=\"".($import_xlate ? "data/" : "http://www.ipni.org/ipni/")."\" />
235 <style type=\"text/css\"><!--
236 .name { font-weight: bold; }
237 .author { font-variant: small-caps; }
238 .publication { }
239 .notes { }
240 --></style>
241 </head><body>
242 \n";
243 }
244
245 sub OUT_flush
246 {
247         print OUT "</body></html>\n";
248         close(OUT);
249 }
250
251 if (defined $maxsize) {
252         my($fileno)=-1;
253         my($filename,$fileid);
254         foreach $owner (sort { $DB{$b}{"name"} cmp $DB{$a}{"name"}; } keys %OWNS) {
255         my( $child );
256
257                 if ($fileno<0 || tell(OUT)>=$maxsize) {
258                         OUT_flush() if ($fileno>=0);
259                         $fileid=sprintf("%04d",++$fileno);
260                         $filename="kew-$fileid.html";
261                         open(OUT,">$filename") or die "Cannot open \"$filename\": $!";
262                         print_header("chunk $fileid");
263                         }
264
265                 if ($debugmatch) {
266                         print STDERR "($owner):";
267                         foreach $child (@{$OWNS{$owner}})
268                                 { print STDERR " ($child)"; }
269                         print STDERR "\n";
270                         }
271                 print OUT format_record("<p><a id=\"".htmlquote($owner)."\">","</a></p>\n",%{$DB{$owner}});
272                 if (@{$OWNS{$owner}}) {
273                         print OUT "<blockquote>\n";
274                         foreach $child (@{$OWNS{$owner}})
275                                 { print OUT format_record("<p>","</p>\n",%{$DB{$child}}); }
276                         print OUT "</blockquote>\n";
277                         }
278                 print OUT "\n";
279                 $DB{$owner}{"_filename"}=$filename;
280                 }
281         OUT_flush();
282         open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!";
283         print_header("Index");
284         }
285 else {
286         open(OUT,">kewensis.html") or die "Cannot open \"kewensis.html\": $!";
287         print_header("Full");
288         }
289
290 sub format_href
291 {
292 my($preinsert,$postinsert,%rec)=@_;
293
294         return htmlquote(rec_to_name(%rec));
295 }
296
297 my($printrecref)=(defined $maxsize ? \&format_href : \&format_record );
298
299 foreach $owner (sort { $DB{$b}{"name"} cmp $DB{$a}{"name"}; } keys %OWNS) {
300 my( $child );
301
302         print OUT "<p>";
303         print OUT "<a href=\"".$DB{$owner}{"_filename"}."#$owner\">" if defined $maxsize;
304         print OUT &{$printrecref}("","",%{$DB{$owner}});
305         print OUT "</a>" if defined $maxsize;
306         print OUT "</p>";
307         if (@{$OWNS{$owner}}) {
308                 print OUT "<ul>\n";
309                 foreach $child (@{$OWNS{$owner}}) {
310                         print OUT "<li>";
311                         print OUT &{$printrecref}("","",%{$DB{$child}});
312                         print OUT "</li>\n";
313                         }
314                 print OUT "</ul>";
315                 }
316         print OUT "\n";
317         }
318 OUT_flush();