619015323b18c65fac5e7132eb04e3fec8c48ea9
[kewensis.git] / kewensis-collect.pl
1 #! /usr/bin/perl
2
3 use strict;
4 use warnings;
5
6 use vars qw/%DB %OWNS %PARENT %CURR $last_field $debugmatch $debugparse $key $owner $parent $ref $note_rest $printdupl $maxsize $fileno $fileid $filename/;
7
8 $maxsize=0x20000;
9 %DB=();
10 %CURR=();
11 $debugparse=0;
12 $printdupl=0;
13 $debugmatch=0;
14
15 sub name_to_key
16 {
17 my( $r )=@_;
18
19         $r=~tr/A-Z/a-z/;
20         $r=~tr/a-z0-9//cd;
21         return $r;
22 }
23
24 sub rec_to_name
25 {
26 my( %rec )=@_;
27 my( @list )=("Genus","Species");
28 my( $item,$r );
29
30         $r="";
31         while ($item=shift @list)
32                 { $r.=" ".$rec{$item} if exists $rec{$item}; }
33         $r=~s/^ //;
34         return $r;
35 }
36
37 sub htmlquote
38 {
39 ($_)=@_;
40
41         s/&/&/g;
42         s/</&lt;/g;
43         s/>/&gt;/g;
44         s/\n/&nl;/g;
45         s/"/&quot;/g;
46         return $_;
47 }
48
49 sub print_rec
50 {
51 my( $preinsert,$postinsert,%rec )=@_;
52
53         return if (!%rec);
54         print OUT "<table border=\"1\" frame=\"border\" rules=\"none\"><col width=\"0*\" /><col width=\"1*\" />\n";
55         print OUT "<tr><th colspan=\"2\">".$preinsert.&rec_to_name(%rec).$postinsert."</th></tr>\n";
56         delete $rec{"Genus"};
57         delete $rec{"Species"};
58         foreach $key (sort keys %rec)
59                 { print OUT "<tr><td class=\"left\">".&htmlquote($key).":</td><td>".&htmlquote($rec{$key})."</td></tr>\n"; }
60         print OUT "</table>\n";
61 }
62
63 sub flush_CURR
64 {
65 my( $key );
66
67         $key=&name_to_key(&rec_to_name(%CURR));
68         return if (!$key);
69         delete $CURR{"Family"};
70         if (exists $DB{$key})
71                 { print STDERR "Key \"$key\" is duplicated!\n" if $printdupl; }
72         else {
73                 print STDERR "ADDKEY: $key\n" if $debugparse;
74                 %{$DB{$key}}=%CURR;
75                 }
76         %CURR=();
77         @{$OWNS{$key}}=();
78 }
79
80 %OWNS=();
81
82 while (<>) {
83         tr/\r\n//d;
84         if (/^ *\304\301\304/) {
85                 &flush_CURR();
86                 print STDERR "---\n" if $debugparse;
87                 }
88         elsif (/^([^³]*[^ ]) *³ *(.*)$/) {
89                 print STDERR "FIELD: $1, VALUE: $2\n" if $debugparse;
90                 $last_field=$1;
91                 $CURR{$1}=$2;
92                 }
93         elsif (/^ *³ *(.*)$/) {
94                 print STDERR "APPEND: $1\n" if $debugparse;
95                 $CURR{$last_field}.=" ".$1;
96                 }
97         else {
98                 print STDERR "DISCARD: $_\n" if $debugparse;
99                 }
100         }
101 &flush_CURR();
102 undef %CURR;
103
104 %PARENT=();
105
106 sub try_reparent
107 {
108 my( $reparent,$refkey );
109
110         $refkey=&name_to_key($ref);
111         return 0 if ($refkey eq "");
112         return 0 if ($parent eq $refkey);
113         return 0 if !exists $DB{$refkey};
114         print STDERR "try_reparent: SUCCESS: key=\"$key\", refkey=\"$refkey\"\n" if $debugmatch;
115         if ($note_rest)
116                 { $DB{$key}{"Notes"}=$note_rest; }
117         else
118                 { delete $DB{$key}{"Notes"}; }
119
120         foreach $reparent (@{$OWNS{$parent}}) {
121                 $PARENT{$reparent}=$refkey;
122                 }
123         @{$OWNS{$parent}}=() if (!defined $OWNS{$parent});
124         @{$OWNS{$refkey}}=() if (!defined $OWNS{$refkey});
125         @{$OWNS{$refkey}}=(@{$OWNS{$refkey}},$parent,@{$OWNS{$parent}});
126         delete $OWNS{$parent};
127
128         return 1;
129 }
130
131 MATCH: foreach $key (keys %DB) {
132         $parent=$PARENT{$key};
133         $parent=$key if (!defined $parent); 
134         $ref=$DB{$key}{"Notes"};
135         next MATCH if (!defined $ref);
136         $ref=~tr/()//d;
137         $ref=~s/^=//;
138         $ref=~s/^O\. *//i;
139         $ref=~s/^Orchidaceae *//i;
140         $ref=~s/\. *(.*)$//;
141         $note_rest=$1;
142         next MATCH if &try_reparent();
143         $ref=$DB{$key}{"Rank"}." $ref" if exists $DB{$key}{"Rank"};
144         next MATCH if &try_reparent();
145         $ref=$DB{$key}{"Genus"}." $ref" if exists $DB{$key}{"Genus"};
146         next MATCH if &try_reparent();
147         }
148 undef %PARENT;
149
150 sub print_header
151 {
152 my($header)=@_;
153
154         print OUT
155 "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>
156 <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">
157 <html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">
158 <head><title>Kewensis $header</title>
159 <style type=\"text/css\"><!--
160 table            { border: ridge; }
161 blockquote table { border: groove; }
162 td      { border: none; }
163 td.left { white-space: nowrap; font-style: italic; }
164 --></style>
165 </head><body>
166 \n";
167 }
168
169 $fileno=-1;
170
171 sub OUT_flush
172 {
173         print OUT "</body></html>\n";
174         close(OUT);
175 }
176
177 my( $filename );
178 foreach $owner (sort keys %OWNS) {
179 my( $child );
180
181         if ($fileno<0 || tell(OUT)>=$maxsize) {
182                 &OUT_flush() if ($fileno>=0);
183                 $fileid=sprintf("%04d",++$fileno);
184                 $filename="kew-$fileid.html";
185                 open(OUT,">$filename") or die "Cannot open \"$filename\": $!";
186                 &print_header("chunk $fileid");
187                 }
188
189         if ($debugmatch) {
190                 print STDERR "($owner):";
191                 foreach $child (@{$OWNS{$owner}})
192                         { print STDERR " ($child)"; }
193                 print STDERR "\n";
194                 }
195         &print_rec("<a name=\"".&htmlquote($owner)."\">","</a>",%{$DB{$owner}});
196         if (@{$OWNS{$owner}}) {
197                 print OUT "<blockquote>\n";
198                 foreach $child (reverse @{$OWNS{$owner}})
199                         { &print_rec("","",%{$DB{$child}}); }
200                 print OUT "</blockquote>\n";
201                 }
202         print OUT "\n";
203         $DB{$owner}{"_filename"}=$filename;
204         }
205 &OUT_flush();
206 open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!";
207 &print_header("Index");
208
209 sub print_href
210 {
211 my($owner)=@_;
212 my(%rec)=%{$DB{$owner}};
213
214         print OUT &htmlquote(&rec_to_name(%rec));
215 }
216
217 foreach $owner (sort keys %OWNS) {
218 my( $child );
219
220         print OUT "<p><a href=\"".$DB{$owner}{"_filename"}."#$owner\">";
221         &print_href($owner);
222         print OUT "</a></p>";
223         if (@{$OWNS{$owner}}) {
224                 print OUT "<ul>\n";
225                 foreach $child (reverse @{$OWNS{$owner}}) {
226                         print OUT "<li>";
227                         &print_href($child);
228                         print OUT "</li>\n";
229                         }
230                 print OUT "</ul>";
231                 }
232         print OUT "\n";
233         }
234 OUT_flush();