Inside-equiv sorting implemented
[kewensis.git] / kewensis-collect.pl
1 #! /usr/bin/perl
2
3 use strict;
4 use warnings;
5
6 use vars qw/%DB %OWNS %PARENT %CURR $last_field $debugmatch $debugparse $key $owner $parent $ref $note_rest $printdupl $maxsize $fileno $fileid $filename/;
7
8 $maxsize=0x40000;
9 %DB=();
10 %CURR=();
11 $debugparse=0;
12 $printdupl=0;
13 $debugmatch=0;
14
15 sub name_to_key
16 {
17 my( $r )=@_;
18
19         $r=~tr/A-Z/a-z/;
20         $r=~tr/a-z0-9//cd;
21         return $r;
22 }
23
24 sub rec_to_name
25 {
26 my( %rec )=@_;
27 my( @list )=("Genus","Species");
28 my( $item,$r );
29
30         $r="";
31         while ($item=shift @list)
32                 { $r.=" ".$rec{$item} if exists $rec{$item}; }
33         $r=~s/^ //;
34         return $r;
35 }
36
37 sub htmlquote
38 {
39 ($_)=@_;
40
41         s/&/&/g;
42         s/</&lt;/g;
43         s/>/&gt;/g;
44         s/\n/&nl;/g;
45         s/"/&quot;/g;
46         return $_;
47 }
48
49 sub format_record
50 {
51 my( $preinsert,$postinsert,%rec )=@_;
52
53         my($r)="<p><span class=\"name\">".$preinsert.&htmlquote(&rec_to_name(%rec)).$postinsert."</span>";
54         $r.="\n".htmlquote($rec{"Publ. Author"}) if (exists($rec{"Publ. Author"}));
55         $r.="<br />\n".htmlquote($rec{"Publication"}) if (exists($rec{"Publication"}));
56         $r.="<br />\n".htmlquote($rec{"Notes"}) if (exists($rec{"Notes"}));
57         $r.="</p>\n";
58         return($r);
59 }
60
61 sub extract_year
62 {
63         ($_)=@_;
64         if (defined($_)) {
65                 s/\(([0-9]{4}) publ. [0-9]{4}\)/($1)/g;
66                 while (/\(([0-9]{4})(-[0-9]+)?\)/) {
67                         return $1 if ($1>=1700 && $1<=2010);
68                         $_=$';
69                         }
70                 }
71         return(-1);
72 }
73
74 sub flush_CURR
75 {
76 my( $key );
77
78         $key=&name_to_key(&rec_to_name(%CURR));
79         return if (!$key);
80         delete $CURR{"Family"};
81         if (exists $DB{$key})
82                 { print STDERR "Key \"$key\" is duplicated!\n" if $printdupl; }
83         else {
84                 print STDERR "ADDKEY: $key\n" if $debugparse;
85                 %{$DB{$key}}=%CURR;
86 #               my($year)=&extract_year($CURR{"Notes"});
87 #               %{$DB{$key}}{"year"}=$year if (defined($year));
88                 }
89         %CURR=();
90         @{$OWNS{$key}}=();
91 }
92
93 %OWNS=();
94
95 while (<>) {
96         tr/\r\n//d;
97         if (/^ *\304\301\304/) {
98                 &flush_CURR();
99                 print STDERR "---\n" if $debugparse;
100                 }
101         elsif (/^([^³]*[^ ]) *³ *(.*)$/) {
102                 print STDERR "FIELD: $1, VALUE: $2\n" if $debugparse;
103                 $last_field=$1;
104                 $CURR{$1}=$2;
105                 }
106         elsif (/^ *³ *(.*)$/) {
107                 print STDERR "APPEND: $1\n" if $debugparse;
108                 $CURR{$last_field}.=" ".$1;
109                 }
110         else {
111                 print STDERR "DISCARD: $_\n" if $debugparse;
112                 }
113         }
114 &flush_CURR();
115 undef %CURR;
116
117 %PARENT=();
118
119 sub try_reparent
120 {
121 my( $reparent,$refkey );
122
123         $refkey=&name_to_key($ref);
124         return 0 if ($refkey eq "");
125         return 0 if ($parent eq $refkey);
126         return 0 if !exists $DB{$refkey};
127         print STDERR "try_reparent: SUCCESS: key=\"$key\", refkey=\"$refkey\"\n" if $debugmatch;
128         if ($note_rest)
129                 { $DB{$key}{"Notes"}=$note_rest; }
130         else
131                 { delete $DB{$key}{"Notes"}; }
132
133         foreach $reparent (@{$OWNS{$parent}}) {
134                 $PARENT{$reparent}=$refkey;
135                 }
136         @{$OWNS{$parent}}=() if (!exists $OWNS{$parent});
137         @{$OWNS{$refkey}}=() if (!exists $OWNS{$refkey});
138         @{$OWNS{$refkey}}=(@{$OWNS{$refkey}},$parent,@{$OWNS{$parent}});
139         delete $OWNS{$parent};
140
141         return 1;
142 }
143
144 MATCH: foreach $key (keys %DB) {
145         $parent=$PARENT{$key};
146         $parent=$key if (!defined $parent); 
147         $ref=$DB{$key}{"Notes"};
148         next MATCH if (!defined $ref);
149         $ref=~tr/()//d;
150         $ref=~s/^=//;
151         $ref=~s/^O\. *//i;
152         $ref=~s/^Orchidaceae *//i;
153         $ref=~s/\. *(.*)$//;
154         $note_rest=$1;
155         next MATCH if &try_reparent();
156         $ref=$DB{$key}{"Rank"}." $ref" if exists $DB{$key}{"Rank"};
157         next MATCH if &try_reparent();
158         $ref=$DB{$key}{"Genus"}." $ref" if exists $DB{$key}{"Genus"};
159         next MATCH if &try_reparent();
160         }
161 undef %PARENT;
162
163 #foreach $key (keys %DB) {
164 #       $DB{$key}{"Publication"}=&extract_year($DB{$key}{"Publication"})." ::: ".$DB{$key}{"Publication"}
165 #                       if (exists($DB{$key}{"Publication"}));
166 #       }
167
168 foreach $key (keys %OWNS) {
169         my(@keys)=@{$OWNS{$key}};
170         delete($OWNS{$key});
171         unshift(@keys,$key);
172         @keys=sort { &extract_year($DB{$b}{"Publication"}) <=> &extract_year($DB{$a}{"Publication"}); } @keys;
173         my($pkey)=shift(@keys);
174         @{$OWNS{$pkey}}=@keys;
175         }
176
177 sub print_header
178 {
179 my($header)=@_;
180
181         print OUT
182 "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>
183 <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">
184 <html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
185 <head><title>Kewensis $header</title>
186 <style type=\"text/css\"><!--
187 .name { font-weight: bold; }
188 --></style>
189 </head><body>
190 \n";
191 }
192
193 $fileno=-1;
194
195 sub OUT_flush
196 {
197         print OUT "</body></html>\n";
198         close(OUT);
199 }
200
201 my( $filename );
202 foreach $owner (sort keys %OWNS) {
203 my( $child );
204
205         if ($fileno<0 || tell(OUT)>=$maxsize) {
206                 &OUT_flush() if ($fileno>=0);
207                 $fileid=sprintf("%04d",++$fileno);
208                 $filename="kew-$fileid.html";
209                 open(OUT,">$filename") or die "Cannot open \"$filename\": $!";
210                 &print_header("chunk $fileid");
211                 }
212
213         if ($debugmatch) {
214                 print STDERR "($owner):";
215                 foreach $child (@{$OWNS{$owner}})
216                         { print STDERR " ($child)"; }
217                 print STDERR "\n";
218                 }
219         print OUT &format_record("<a id=\"".&htmlquote($owner)."\">","</a>",%{$DB{$owner}});
220         if (@{$OWNS{$owner}}) {
221                 print OUT "<blockquote>\n";
222                 foreach $child (@{$OWNS{$owner}})
223                         { print OUT &format_record("","",%{$DB{$child}}); }
224                 print OUT "</blockquote>\n";
225                 }
226         print OUT "\n";
227         $DB{$owner}{"_filename"}=$filename;
228         }
229 &OUT_flush();
230 open(OUT,">kew-index.html") or die "Cannot open \"kew-index.html\": $!";
231 &print_header("Index");
232
233 sub print_href
234 {
235 my($owner)=@_;
236 my(%rec)=%{$DB{$owner}};
237
238         print OUT &htmlquote(&rec_to_name(%rec));
239 }
240
241 foreach $owner (sort keys %OWNS) {
242 my( $child );
243
244         print OUT "<p><a href=\"".$DB{$owner}{"_filename"}."#$owner\">";
245         &print_href($owner);
246         print OUT "</a></p>";
247         if (@{$OWNS{$owner}}) {
248                 print OUT "<ul>\n";
249                 foreach $child (@{$OWNS{$owner}}) {
250                         print OUT "<li>";
251                         &print_href($child);
252                         print OUT "</li>\n";
253                         }
254                 print OUT "</ul>";
255                 }
256         print OUT "\n";
257         }
258 OUT_flush();