Version developed during Xmas, not tested
[kewensis.git] / kewensis-collect.pl
1 #! /usr/bin/perl
2
3 use strict;
4 use warnings;
5
6 use Cwd;
7 use Data::Dumper;
8 use DBI;
9
10 my(%DB,$debugmatch,$D,$key,$owner,$ref,$filename,$doimport,$import_xlate);
11
12 %DB=();
13 $D=1;
14 $debugmatch=0;
15 $doimport=1;
16 $import_xlate=1;
17
18 sub extract_year
19 {
20         ($_)=@_;
21         if (defined($_)) {
22                 s/\(([0-9]{4}) publ. [0-9]{4}\)/($1)/g;
23                 while (/\(([0-9]{4})(-[0-9]+)?\)/) {
24                         return $1 if ($1>=1700 && $1<=2010);
25                         $_=$';
26                         }
27                 }
28         return -1;
29 }
30
31 sub failed
32 {
33 my($file)=@_;
34
35         print("-$file----------------$'--------------\n");
36         warn "Unable to match file \"".getcwd()."/$file\".";
37 }
38
39 sub process_file
40 {
41 my($file)=@_;
42
43         if (!open(FI,$file)) {
44                 warn("Unable to open file \"$file\" in dir \"".getcwd()."\": $!");
45                 return;
46                 }
47         undef $/;
48         my($fi)=<FI>;
49         close(FI);
50         my($word)='[^<]*';
51   my($bigword)='[^<]*(?:<IT>)?[^<]*(?:<RO>)?[^<]*(?:<1000 m)?[^<]*';
52         my($idone)='[-\d]';
53         my($id)="$idone+";
54         my($any)='[\x00-\xFF]*';
55         my($ipniservletword)="<a href=\"\\./IpniServlet\\?id=($id)&query_type=by_id\">($word)</a>";
56         my($idquoted)=$id; $idquoted=~s%\W%\\$&%g;
57         my($ipniservletwordthree)=$ipniservletword; $ipniservletwordthree=~s%\($idquoted\)%((?:$idone){3})$&%os or die;
58         my($attrpat)="<p>nomenclatural synonym(\\(Main Record\\))?:$ipniservletword</p>|<p>basionym(\\(\\d+\\))?:$ipniservletword</p>|<p>basionym:($word)|<p>replaced synonym:$ipniservletword</p>|<p>replaced synonym:($word)|<p>Is a replaced synonym of:$ipniservletword</p>|<p>Is a basionym of:$ipniservletword</p>|<p>later publication of(\\(\\d+\\))?:$ipniservletword</p>|<p>Is a later publication of of:$ipniservletword</p>";
59         $|=1;
60         "<undef>"=~/^/;
61         if ($fi!~m%^<html>
62 <head>
63 <title>IPNI Query Results</title>
64 </head>
65 <body bgcolor="#ffffff" text="#000000" link="#006666" vlink="#008080" alink="#008080">
66 <HR><b><i>Orchidaceae</i> ($word)</b> ($word) <br>
67 (?:<a href="\./PublicationServlet\?id=($id)&query_type=by_id"> ($word)</a> ($word)|($bigword(?:<br>\n$word)*))?(<p>($bigword)?</p><p>($word)?</p>(?:
68 remarks: .*)?(<HR><h4>Type</h4>)?(<table $any</table>)?(?:
69 <h4>Linked Records</h2>
70 ((?:$attrpat)*))?(<br>
71
72 <h4>Original Data</h2>
73 (?:basionym: ($word)<br>
74 )?(?:hybrid parentage: ($word)<br>
75 )?(?:replaced synonym: ($word)<br>
76 )?(?:distribution: ($word)<br>
77 )?(?:Notes: ($word))?)?)(?:<p><a href="\./query_ipni.html">Back to Search Page</a></p>
78 </body>
79 </html>)$%os) {
80                 failed($file);
81                 return;
82                 }
83         my(%rec);
84         $rec{"name"}=$1;
85         $rec{"Publ. Author"}=$2;
86         my $publ="";
87         $publ.="$4 $5" if defined($4) && defined($5);
88         $publ.=$6 if defined $6;
89         $rec{"html"}=$7 if defined $7;
90         my($attrsbody)=$12;
91         # catch-array destroyed here!
92         $publ.=s/\<br\>//g;
93         $rec{"Publication"}=$publ if $publ ne "";
94         ($rec{"id"}=$file)=~s#^($id)\.html$#$1#os or failed($file);
95         $rec{"html"}=~s#$ipniservletwordthree#<a href="$1/$1$2.html">$3</a>#osg if $import_xlate && exists $rec{"html"};
96         $rec{"refs"}=[];
97         my $score=0; # -: upper, +: lower
98         while (defined($attrsbody) && $attrsbody=~s%^(?:$attrpat)%%os) {
99                 # nomenclatural synonym: id=$2
100                 # basionym: id=$5
101                 # replaced synonym: id=$8
102                 # Is a replaced synonym: id=$11
103                 # Is a basionym: id=$13
104                 # later publication: id=$16
105                 # Is a later publication: id=$18
106                 my(@refs)=($2,$5,$8,$11,$13,$16,$18);
107                 while (@refs) {
108                         push(@{$rec{"refs"}},$_) if defined ($_=shift @refs);
109                         }
110                 $score+=+10 if defined  $5; # basionym: id=$5
111                 $score+=-10 if defined $13; # Is a basionym: id=$13
112                 $score+=- 4 if defined $16; # later publication: id=$16
113                 $score+=+ 4 if defined $18; # Is a later publication: id=$18
114                 }
115         $score+=+9 if $rec{"Publ. Author"}=~/^\s*\(/;
116         $rec{"score"}=$score;
117         if ($attrsbody) {
118                 failed($file);
119                 return;
120                 }
121         $DB{$rec{"id"}}=\%rec;
122 }
123
124
125 sub process_dir
126 {
127 my( $dir )=@_;
128 my( $old )=getcwd();
129
130         if (!chdir($dir)) {
131                 warn("Unable to change to dir \"$dir\" from dir \"$old\": $!");
132                 return;
133                 }
134         opendir(DIR,".") or die("Cannot open . in \"".getcwd()."\": $!");
135         foreach (sort readdir(DIR)) {
136                 process_entry($_) if ($_!~/^\./);
137                 }
138         closedir(DIR);
139         chdir($old) or warn("Unable to retreat to dir \"$old\": $!");
140 }
141
142 sub process_entry
143 {
144 my( $entry )=@_;
145
146         process_dir ($entry) if -d $entry;
147         process_file($entry) if -f $entry;
148 }
149
150 foreach (@ARGV)
151         { process_entry($_); }
152
153 for my $id (keys %DB) {
154         my($refid);
155         my(@refs);
156         for $refid (@{$DB{$id}{"refs"}}) {
157                 if (!exists $DB{$refid}) {
158                         print "Undefined ref id \"$refid\" from id \"$id\"\n" if $D>=1;
159                         next;
160                         }
161                 next if $id eq $refid;  # self-ref
162 #               push(@refs,$DB{$refid});
163                 push(@refs,$refid);
164                 }
165         $DB{$id}{"refs"}=\@refs;
166         }
167
168 print Data::Dumper->Dump([\%DB],["%DB"]) if $D>=2;
169
170 my %OWNS=map { $_=>$DB{$_}{"refs"}; } keys(%DB);
171 my %NAMES=();
172
173 for my $id (keys %DB) {
174         $NAMES{$DB{$id}{"name"}}=[] if !exists $NAMES{$DB{$id}{"name"}};
175         push @{$NAMES{$DB{$id}{"name"}}},$id;
176         }
177
178 for my $id (keys %DB) {
179         next if !exists $OWNS{$id};
180         my @queue=($id,@{$OWNS{$id}});
181         $OWNS{$id}=[];
182         while (my $refid=shift @queue) {
183                 if (exists $NAMES{$DB{$refid}{"name"}}) {
184                         push(@queue,@{$NAMES{$DB{$refid}{"name"}}});
185                         delete $NAMES{$DB{$refid}{"name"}};
186                         }
187                 next if $refid eq $id;
188                 next if !exists $OWNS{$refid};
189                 push @queue,@{$OWNS{$refid}};
190                 delete $OWNS{$refid};
191                 print "processed connect id=$id <- refid=$refid\n" if $D>=1;
192                 push @{$OWNS{$id}},$refid;
193                 }
194         }
195
196 print Data::Dumper->Dump([\%OWNS],["%OWNS"]) if $D>=2;
197
198 foreach $key (keys %OWNS) {
199         my(@keys)=@{$OWNS{$key}};
200         delete($OWNS{$key});
201         unshift(@keys,$key);
202 #       @keys=sort { $DB{$a}{"score"} <=> $DB{$b}{"score"}
203 #                       || extract_year($DB{$b}{"Publication"}) <=> extract_year($DB{$a}{"Publication"}); } @keys;
204         @keys=sort { $DB{$a}{"name"} cmp $DB{$b}{"name"}; } @keys;
205         my($pkey)=shift(@keys);
206         $OWNS{$pkey}=\@keys;
207         }
208
209 my($db_driver,$db_host,$db_user,$db_pwd,$DB_PWD,$db_name,$db);
210 my($tb_tree);
211
212 $db_driver="mysql";
213 $db_host="";
214 $db_user="short";
215 #$DB_PWD=$ENV{"HOME"}."/priv/mysql.${db_user}.pwd";
216 $db_pwd="short";
217 $db_name="short";
218 $tb_tree="kewensis_tree";
219
220 $db=DBI->connect("DBI:$db_driver:database=$db_name;host=$db_host",$db_user,$db_pwd) or die "Database open fail: $!";
221
222 sub db_do
223 {
224 my( $cmd )=@_;
225
226         $db->do($cmd) or die("SQL command \"$cmd\" failed: $!");
227 }
228
229 eval { &db_do("drop table $tb_tree") };
230
231 &db_do("create table $tb_tree ("
232                 ."id char(10) not null,"
233                 ."family_id char(10) not null,"
234                 ."family_order int not null,"
235                 ."name varchar(100) not null,"
236                 ."PublAuthor text null,"
237                 ."Publication text null,"
238                 ."html text null"
239                 .")");
240
241 &db_do("alter table $tb_tree add unique (id)");
242 &db_do("alter table $tb_tree add index (name)");
243 &db_do("alter table $tb_tree add unique (family_id,family_order)");
244
245 my $insert_tb_tree=$db->prepare("insert into $tb_tree (id,family_id,family_order,name,PublAuthor,Publication,html) values (?,?,?,?,?,?,?)")
246                 or die "Prepare fail: $!";
247
248 foreach $owner (sort { $DB{$b}{"name"} cmp $DB{$a}{"name"}; } keys %OWNS) {
249         my @family=@{$OWNS{$owner}};
250         unshift(@family,$owner);
251         my $family_id=$DB{$owner}{"id"};
252         for my $family_order (0..$#family) {
253                 my $id=$family[$family_order];
254                 print "insert:$id,".$DB{$id}{"name"}."\n" if $D>=2;
255                 $insert_tb_tree->execute($id,$family_id,$family_order,
256                                 $DB{$id}{"name"},$DB{$id}{"Publ. Author"},$DB{$id}{"Publication"},$DB{$id}{"html"}
257                                 ) or die "SQL insert failure: $!";
258                 }
259         }